library(here)
## here() starts at /Users/krishabugajski/git_projects/capstone-project-team-c
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(readr)
library(tibble)
library(knitr)
library(ggplot2)
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(tidyr)
library(corrplot)
## corrplot 0.95 loaded
library(gt)
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
library(caret)
## Loading required package: lattice
library(rpart)
library(rpart.plot)
library(combinat)
##
## Attaching package: 'combinat'
## The following object is masked from 'package:utils':
##
## combn
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1-8
library(xgboost)
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
library(Metrics)
##
## Attaching package: 'Metrics'
## The following objects are masked from 'package:caret':
##
## precision, recall
#library(SHAPforxgboost)
# Load the six raw data sets from the project's data/raw-data directory.
# here() resolves paths relative to the project root, so the script works
# regardless of the working directory it is knitted from.
# Note: `<-` is the idiomatic R assignment operator (was `=`).
weather    <- read.csv(here("data", "raw-data", "weather-data.csv"))
airquality <- read.csv(here("data", "raw-data", "air-quality-data.csv"))
boston     <- read.csv(here("data", "raw-data", "boston-marathon-data.csv"))
chicago    <- read.csv(here("data", "raw-data", "chicago-marathon-data.csv"))
nyc        <- read.csv(here("data", "raw-data", "nyc-marathon-data.csv"))
berlin     <- read.csv(here("data", "raw-data", "berlin-marathon-data.csv"))
Look at the raw datasets (KB)
# Inspect the raw Boston structure (age, gender, official_time, overall, city, year)
str(boston)
## 'data.frame': 494326 obs. of 7 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ age : int 0 0 0 0 0 0 0 0 0 0 ...
## $ gender : chr "M" "M" "M" "M" ...
## $ official_time: chr "2:07:15" "2:07:19" "2:08:08" "2:08:09" ...
## $ overall : int 1 2 3 4 5 6 7 8 9 10 ...
## $ city : chr "Boston" "Boston" "Boston" "Boston" ...
## $ year : int 1994 1994 1994 1994 1994 1994 1994 1994 1994 1994 ...
# Inspect the raw Berlin structure (YEAR, COUNTRY, GENDER, AGE, TIME)
str(berlin)
## 'data.frame': 884944 obs. of 5 variables:
## $ YEAR : int 1974 1974 1974 1974 1974 1974 1974 1974 1974 1974 ...
## $ COUNTRY: chr "" "" "" "" ...
## $ GENDER : chr "male" "male" "male" "male" ...
## $ AGE : chr "L1" "L2" "L2" "L" ...
## $ TIME : chr "02:44:53" "02:46:43" "02:48:08" "02:48:40" ...
# Inspect the raw NYC structure (Year, Race, Name, Gender, Age, ..., Finish.Time)
str(nyc)
## 'data.frame': 1460286 obs. of 10 variables:
## $ Year : int 2024 2024 2024 2024 2024 2024 2024 2024 2024 2024 ...
## $ Race : chr "NYC Marathon" "NYC Marathon" "NYC Marathon" "NYC Marathon" ...
## $ Name : chr "Abdi Nageeye" "Evans Chebet" "Albert Korir" "Tamirat Tola" ...
## $ Gender : chr "M" "M" "M" "M" ...
## $ Age : int 35 35 30 33 31 27 31 30 35 31 ...
## $ State : chr "" "-0" "" "" ...
## $ Country : chr "NLD" "KEN" "KEN" "ETH" ...
## $ Overall : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Finish.Time: chr "02:07:39" "02:07:45" "02:08:00" "02:08:12" ...
## $ Finish : int 7659 7665 7680 7692 7730 7740 7761 7839 7839 7857 ...
# Inspect the raw Chicago structure (Race, Year, Name, Gender, ..., Finish.Time)
str(chicago)
## 'data.frame': 886546 obs. of 9 variables:
## $ Race : chr "Chicago" "Chicago" "Chicago" "Chicago" ...
## $ Year : int 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 ...
## $ Name : chr "Greg McCarter" "Marcos De Sa" "Claudio Ravera" "Mitch Auerbach" ...
## $ Gender : chr "M" "M" "M" "M" ...
## $ Age : int 59 44 44 54 39 24 34 24 39 44 ...
## $ Country : chr "USA" "USA" "ITA" "USA" ...
## $ Overall : int 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 ...
## $ Finish.Time: chr "03:25:51" "03:25:52" "03:25:52" "03:25:52" ...
## $ Finish : int 12351 12352 12352 12352 12353 12353 12353 12353 12354 12354 ...
# Inspect the raw weather structure (Year/Date/Marathon plus conditions)
str(weather)
## 'data.frame': 115 obs. of 11 variables:
## $ Year : int 2025 2024 2023 2022 2021 2019 2018 2017 2016 2015 ...
## $ Date : chr "10-12-2025" "10-13-2024" "10-8-2023" "10-9-2022" ...
## $ Marathon : chr "Chicago" "Chicago" "Chicago" "Chicago" ...
## $ High.Temp : int 68 67 55 71 83 55 63 80 63 79 ...
## $ Low.Temp : int 51 51 45 46 72 39 57 56 50 53 ...
## $ Day.Average.Temp : num 59.9 60.2 50.1 58 75.5 ...
## $ Precipitation : num 0 0 0 0 0 0 0.19 0.44 0 0 ...
## $ Average.Dew.Point : num 48.9 46.5 36 35.1 60.7 ...
## $ Max.Wind.Speed : int 12 23 13 14 18 23 13 21 14 18 ...
## $ Visibility : num 10 10 10 10 10 10 10 10 10 10 ...
## $ Sea.Level.Pressure: num 29.4 29.3 29.4 29.5 29.1 ...
# Inspect the raw air-quality structure (note PM10 arrives as character)
str(airquality)
## 'data.frame': 115 obs. of 10 variables:
## $ Year : int 2025 2024 2023 2022 2021 2019 2018 2017 2016 2015 ...
## $ Date : chr "10-12-2025" "10-13-2024" "10-8-2023" "10-9-2022" ...
## $ Marathon : chr "Chicago" "Chicago" "Chicago" "Chicago" ...
## $ Overall.AQI.Value: int 42 91 43 50 72 44 48 46 67 51 ...
## $ Main.Pollutant : chr "PM2.5" "PM10" "PM2.5" "PM2.5" ...
## $ Ozone : int 41 34 27 39 46 36 21 44 37 47 ...
## $ PM10 : chr "16" "91" "16" "46" ...
## $ PM2.5 : int 42 70 43 50 72 44 48 46 67 51 ...
## $ NO2 : int NA 25 25 39 28 27 8 36 27 20 ...
## $ CO : int NA 5 3 8 6 3 3 3 5 8 ...
We can see that several of the datasets have varying variable names/formats for year, marathon, gender, and chip_time. We can also see there are variables that are not necessary in several of the marathon datasets.
Standardize all the raw marathon datasets; clean variable names and select necessary variables.
## Standardize the four raw marathon files to a common schema:
## year, marathon (race label), gender, chip_time (finish time as text).
## mutate() + select() with rename pairs yields the same four columns, in
## the same order, as the original transmute() calls.
boston_clean <- boston %>%
  mutate(marathon = "Boston") %>%
  select(year, marathon, gender, chip_time = official_time)

berlin_clean <- berlin %>%
  mutate(marathon = "Berlin") %>%
  select(year = YEAR, marathon, gender = GENDER, chip_time = TIME)

nyc_clean <- nyc %>%
  mutate(marathon = "NYC") %>%
  select(year = Year, marathon, gender = Gender, chip_time = Finish.Time)

chicago_clean <- chicago %>%
  mutate(marathon = "Chicago") %>%
  select(year = Year, marathon, gender = Gender, chip_time = Finish.Time)
Standardize the raw weather data; format the variables names to make them clean (KB)
## Standardize the weather data: snake_case names, modeling columns only.
## Every field is a straight rename, so select() with rename pairs
## reproduces the original transmute() exactly (same columns, same order).
weather_clean <- weather %>%
  select(
    year = Year,
    marathon = Marathon,
    high_temp = High.Temp,
    low_temp = Low.Temp,
    avg_temp = Day.Average.Temp,
    precipitation = Precipitation,
    dew_point = Average.Dew.Point,
    wind_speed = Max.Wind.Speed,
    visibility = Visibility,
    sea_level_pressure = Sea.Level.Pressure
  )
Standardize the raw airquality data; format the variables names to make them clean (KB)
## Standardize the air-quality data: snake_case names, pollutant readings
## coerced to numeric. PM10 arrives as character with some non-numeric
## entries, so only its coercion warnings are suppressed (matching the
## original behavior for the other columns).
airquality_clean <- airquality %>%
  select(
    year = Year,
    marathon = Marathon,
    aqi = Overall.AQI.Value,
    main_pollutant = Main.Pollutant,
    co = CO,
    ozone = Ozone,
    pm10 = PM10,
    pm25 = PM2.5,
    no2 = NO2
  ) %>%
  mutate(
    co = as.numeric(co),
    ozone = as.numeric(ozone),
    pm10 = suppressWarnings(as.numeric(pm10)),
    pm25 = as.numeric(pm25),
    no2 = as.numeric(no2)
  )
# using bind_rows so we can row-wise combine and have data for each runner
marathons_all <- bind_rows(
boston_clean,
berlin_clean,
nyc_clean,
chicago_clean
)
str(marathons_all)
## 'data.frame': 3726102 obs. of 4 variables:
## $ year : int 1994 1994 1994 1994 1994 1994 1994 1994 1994 1994 ...
## $ marathon : chr "Boston" "Boston" "Boston" "Boston" ...
## $ gender : chr "M" "M" "M" "M" ...
## $ chip_time: chr "2:07:15" "2:07:19" "2:08:08" "2:08:09" ...
We can see that marathons_all contains the identifying variables (year, marathon, gender), and the outcome variable chip_time.
Select only years we need (1996 to 2025) (KB)
# Restrict to the study window 1996-2025; multiple conditions passed to
# filter() are combined with AND, equivalent to the single `&` expression.
marathons_all <- marathons_all %>%
  filter(year >= 1996, year <= 2025)
unique(marathons_all$year)
## [1] 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010
## [16] 2011 2012 2014 2015 2016 2017 2018 2019 2013 2024 2022 2021 2023
We can see that the merged marathon data set now contains years from 1996 to 2024, as most of the marathons do not have data all the way up to 2025.
Need chip_seconds to find the average finishing times (helper function below adapted from ChatGPT).
# Cleans and standardizes chip-time values by converting Excel-style numeric
# times to HH:MM:SS, trimming and formatting text times, replacing invalid
# entries with NA, and padding missing leading zeros.
# Clean and standardize a vector of raw chip-time values.
#
# Handles three cases seen in the raw marathon files:
#   1. Excel-style day fractions (e.g. 0.5 == 12:00:00) -> "HH:MM:SS" text
#   2. Known "no result" markers (DNF, DNS, DQ, "no time", blanks, dashes)
#      -> NA; the comparison is case-insensitive so "dnf"/"No Time"/"NA"
#      are all caught (the original matched only a fixed-case list)
#   3. Text times missing leading zeros (e.g. "2:9:5") -> "02:09:05"
#
# @param x character or numeric vector of raw finish times.
# @return character vector of "HH:MM:SS" strings, with NA for invalid input.
clean_chip_time <- function(x) {
  # Work on character data throughout; the original assigned character
  # strings into a possibly-numeric vector and relied on silent coercion.
  x <- trimws(as.character(x))
  # Convert Excel-style day fractions (0 < value < 1) to HH:MM:SS text
  x_numeric <- suppressWarnings(as.numeric(x))
  is_fraction <- !is.na(x_numeric) & x_numeric > 0 & x_numeric < 1
  x[is_fraction] <- format(
    as.POSIXct("1970-01-01", tz = "UTC") + x_numeric[is_fraction] * 86400,
    "%H:%M:%S"
  )
  # Replace known invalid markers with NA (case-insensitive)
  invalid <- c("", "na", "n/a", "\u2014", "-", "dnf", "dns", "dq", "no time")
  x[tolower(x) %in% invalid] <- NA
  # Pad missing zeros (H:MM:SS -> HH:MM:SS, :M: -> :0M:, :S -> :0S)
  x <- gsub("^([0-9]):", "0\\1:", x)
  x <- gsub(":([0-9]):", ":0\\1:", x)
  x <- gsub(":([0-9])$", ":0\\1", x)
  x
}
Clean chip_time; using clean_chip_time function (KB)
# Standardize every raw finish-time string via clean_chip_time(); base
# column assignment appends chip_time_clean exactly as mutate() would.
marathons_all$chip_time_clean <- clean_chip_time(marathons_all$chip_time)
Convert the cleaned chip_time values (HH:MM:SS) to hms() and period_to_seconds() and gets a new column chip_seconds (KB)
# Parse the cleaned HH:MM:SS strings into lubridate periods, then into
# total seconds; unparseable values become NA (parse warnings suppressed).
# Base column assignment appends chip_seconds exactly as mutate() would.
marathons_all$chip_seconds <- suppressWarnings(
  period_to_seconds(hms(marathons_all$chip_time_clean))
)
head(marathons_all)
## year marathon gender chip_time chip_time_clean chip_seconds
## 1 1996 Boston M 2:09:15 02:09:15 7755
## 2 1996 Boston M 2:09:26 02:09:26 7766
## 3 1996 Boston M 2:09:51 02:09:51 7791
## 4 1996 Boston M 2:10:03 02:10:03 7803
## 5 1996 Boston M 2:10:09 02:10:09 7809
## 6 1996 Boston M 2:10:21 02:10:21 7821
We can see chip_time with the original data,
chip_time_clean with the uniform cleaned data across all
marathons, and chip_seconds with the total time in
seconds.
Need to see what we want to do with these missing values and where they are coming from (KB)
# Count rows where the chip time could not be parsed into seconds
marathons_all %>%
summarize(missing_finish_times = sum(is.na(chip_seconds)))
## missing_finish_times
## 1 3
# Inspect the specific rows with missing chip_seconds to see where they came from
marathons_all %>%
filter(is.na(chip_seconds))
## year marathon gender chip_time chip_time_clean chip_seconds
## 1 1996 Berlin male no time <NA> NA
## 2 1996 Berlin male no time <NA> NA
## 3 1998 Berlin male no time <NA> NA
We can see that there are only 3 rows with missing chip_times.
Remove missing finish times. This is safe because we only use average finishing times in our model, so individual missing times do not matter. Also there are only 3 total
# Drop the 3 unparseable ("no time") rows; tidyr's drop_na() on a single
# column is equivalent to filter(!is.na(chip_seconds)).
marathons_all <- marathons_all %>%
  drop_na(chip_seconds)
Label as ‘male’, ‘female’, or ‘unknown’, and we decided that nonbinary falls under female.
# Check all unique names under gender
# (the raw files encode gender inconsistently: M/F, m, male/female, W, X, U, "")
unique(marathons_all$gender)
## [1] "M" "F" "U" "m" "male" "female" "W" "X"
## [9] ""
We can see that there are a lot of ways ‘male’, ‘female’, ‘nonbinary’, and ‘unknown’ are labeled.
Make all the genders uniform and standardized (KB)
# Collapse the many raw gender codes into male / female / unknown.
# Nonbinary codes are grouped with female per the team's decision; folding
# tolower() into the case_when conditions gives the same three outputs as
# lowercasing the column first.
marathons_all <- marathons_all %>%
  mutate(
    gender = case_when(
      tolower(gender) %in% c("m", "male") ~ "male",
      tolower(gender) %in% c("f", "w", "x", "female", "nonbinary", "nb") ~ "female",
      TRUE ~ "unknown"
    )
  )
table(marathons_all$gender)
##
## female male unknown
## 1132049 2052858 56
We can see that now we only have three genders, female,
male, and unknown which consist of 56
rows.
Remove unknowns since we have only 56 unknowns, and they will not help the model and most likely add noise: (KB)
# Keep only male/female rows; after standardization gender holds exactly
# three values, so selecting these two is the same as dropping "unknown"
# (56 rows that would only add noise).
marathons_all <- marathons_all %>%
  filter(gender %in% c("female", "male"))
table(marathons_all$gender)
##
## female male
## 1132049 2052858
We successfully removed the unknowns, leaving us with the desired
female and male.
Create a variable called winner_time that consists of the winner’s (best) finishing time for each marathon, year, and gender. Then create a variable called time_ratio that consists of chip_seconds / winner_time.
# Add winner_time for each marathon-year-gender
# Within each marathon-year-gender cell, record the winner's (minimum)
# time and express every runner's time as a ratio to it. time_ratio is an
# elementwise division, so computing it inside the grouped mutate gives
# the same result as computing it after ungroup().
runners <- marathons_all %>%
  group_by(marathon, year, gender) %>%
  mutate(
    winner_time = min(chip_seconds, na.rm = TRUE),
    time_ratio = chip_seconds / winner_time
  ) %>%
  ungroup()
Create subgroups based on time_ratio. Look at a histogram to adjust the boundaries of the subgroups based on the distribution of the histogram of time_ratio and clear clusters. (KB)
# Bin time_ratio into performance tiers. cut() with right-closed breaks
# reproduces the original case_when boundaries exactly (<=1.30 elite,
# (1.30,1.55] competitive, (1.55,1.80] average, (1.80,2.10] recreational,
# >2.10 slow; NA stays NA) and returns a factor whose levels are already
# in the desired order.
runners <- runners %>%
  mutate(
    subgroup = cut(
      time_ratio,
      breaks = c(-Inf, 1.30, 1.55, 1.80, 2.10, Inf),
      labels = c("elite", "competitive", "average", "recreational", "slow")
    )
  )
# Create histogram of ratio_time amongst male and female runners with boundaries for subgroups in red
ggplot(runners, aes(x = time_ratio, fill = gender)) +
geom_histogram(binwidth = 0.05, position = "dodge") +
scale_fill_manual(values = c("female" = "deeppink", "male" = "steelblue")) +
geom_vline(xintercept = c(1.3, 1.55, 1.80, 2.10), linetype = "dashed", color = "red") +
labs(
title = "Distribution of Runner Time Ratios to Winner by Gender",
x = "Time Ratio (Runner / Winner)",
y = "Count"
) +
theme_minimal()
Looking at the histogram of the time_ratio above, we can see that there is a clear clustering in runner-to-winner time ratios and a long right-skewed tail, which helped with the placement of the performance thresholds/ vertical dashed red lines (1.30, 1.55, 1.80, 2.10). We can also see that the distribution for both male and female follow a similar shape, with overall less female relative male runners. Overall, this figure supports the use of ratio-based performance categories by illustrating natural clustering and skewness in marathon finishing times.
Now we can further divide the subgroups created by genders. (KB)
# Count runners in each gender-by-subgroup cell
# (summarise() keeps the 'gender' grouping, hence the .groups message below)
runners %>%
group_by(gender, subgroup) %>%
summarise(count = n()) %>%
arrange(gender, subgroup)
## `summarise()` has grouped output by 'gender'. You can override using the
## `.groups` argument.
## # A tibble: 10 × 3
## # Groups: gender [2]
## gender subgroup count
## <chr> <fct> <int>
## 1 female elite 12412
## 2 female competitive 127521
## 3 female average 293867
## 4 female recreational 356394
## 5 female slow 341855
## 6 male elite 29047
## 7 male competitive 233651
## 8 male average 482666
## 9 male recreational 624377
## 10 male slow 683117
We can see that the subgroups now have groups for male
and female. We can see that the elite groups both has less
runners and male runners have more runners in each group overall. We
will leave this as is because these cutoffs are not arbitrary, and they
follow the natural shape of the data rather than relying on fixed
percentile breaks. Weighting could later be applied; however, it is not
initially because subgroup effects are not the primary focus.
# Average finishing time and group size per marathon-year-gender-subgroup cell;
# .groups = "drop" returns an ungrouped tibble
avg_times <- runners %>%
group_by(marathon, year, gender, subgroup) %>%
summarize(
n = n(), # number of runners in each subgroup
avg_chip_seconds = mean(chip_seconds, na.rm = TRUE),
.groups = "drop"
)
avg_times
## # A tibble: 1,005 × 6
## marathon year gender subgroup n avg_chip_seconds
## <chr> <int> <chr> <fct> <int> <dbl>
## 1 Berlin 1996 female elite 131 10641.
## 2 Berlin 1996 female competitive 541 12733.
## 3 Berlin 1996 female average 901 14707.
## 4 Berlin 1996 female recreational 402 16821.
## 5 Berlin 1996 female slow 60 19054.
## 6 Berlin 1996 male elite 737 9468.
## 7 Berlin 1996 male competitive 3416 11173.
## 8 Berlin 1996 male average 5411 12974.
## 9 Berlin 1996 male recreational 3439 14841.
## 10 Berlin 1996 male slow 894 17374.
## # ℹ 995 more rows
We now have a dataset that contains 1005 rows and 6 columns/ variables. This means that some of the years must be missing for some of the marathon as we know previously when collecting the raw data. Berlin goes from (1996 to 2019), Boston goes from (1996 to 2019), Chicago goes from (1996 to 2023), and NYC goes from (1996 to 2024). Also, after looking through the dataset more, we can see that Berlin is missing data from all female subgroups in 2019 and NYC is missing all male subgroups for 2024. We will leave the data as is because having missing years and genders is not our primary focus, as our focus is on having as much data for finishing/chip times as possible.
# Attach race-day weather and air-quality readings to each subgroup row,
# matching on year + marathon, then move the group size n to the first
# column for readability. Combining both steps into one pipe produces the
# same final_data as the original two assignments.
final_data <- avg_times %>%
  left_join(weather_clean, by = c("year", "marathon")) %>%
  left_join(airquality_clean, by = c("year", "marathon")) %>%
  select(n, everything())
final_data
## # A tibble: 1,005 × 21
## n marathon year gender subgroup avg_chip_seconds high_temp low_temp
## <int> <chr> <int> <chr> <fct> <dbl> <int> <int>
## 1 131 Berlin 1996 female elite 10641. 57 44
## 2 541 Berlin 1996 female competitive 12733. 57 44
## 3 901 Berlin 1996 female average 14707. 57 44
## 4 402 Berlin 1996 female recreational 16821. 57 44
## 5 60 Berlin 1996 female slow 19054. 57 44
## 6 737 Berlin 1996 male elite 9468. 57 44
## 7 3416 Berlin 1996 male competitive 11173. 57 44
## 8 5411 Berlin 1996 male average 12974. 57 44
## 9 3439 Berlin 1996 male recreational 14841. 57 44
## 10 894 Berlin 1996 male slow 17374. 57 44
## # ℹ 995 more rows
## # ℹ 13 more variables: avg_temp <dbl>, precipitation <dbl>, dew_point <dbl>,
## # wind_speed <int>, visibility <dbl>, sea_level_pressure <dbl>, aqi <int>,
## # main_pollutant <chr>, co <dbl>, ozone <dbl>, pm10 <dbl>, pm25 <dbl>,
## # no2 <dbl>
# Confirm merged column types after the joins
str(final_data)
## tibble [1,005 × 21] (S3: tbl_df/tbl/data.frame)
## $ n : int [1:1005] 131 541 901 402 60 737 3416 5411 3439 894 ...
## $ marathon : chr [1:1005] "Berlin" "Berlin" "Berlin" "Berlin" ...
## $ year : int [1:1005] 1996 1996 1996 1996 1996 1996 1996 1996 1996 1996 ...
## $ gender : chr [1:1005] "female" "female" "female" "female" ...
## $ subgroup : Factor w/ 5 levels "elite","competitive",..: 1 2 3 4 5 1 2 3 4 5 ...
## $ avg_chip_seconds : num [1:1005] 10641 12733 14707 16821 19054 ...
## $ high_temp : int [1:1005] 57 57 57 57 57 57 57 57 57 57 ...
## $ low_temp : int [1:1005] 44 44 44 44 44 44 44 44 44 44 ...
## $ avg_temp : num [1:1005] 52.3 52.3 52.3 52.3 52.3 ...
## $ precipitation : num [1:1005] 0.23 0.23 0.23 0.23 0.23 0.23 0.23 0.23 0.23 0.23 ...
## $ dew_point : num [1:1005] 48.6 48.6 48.6 48.6 48.6 ...
## $ wind_speed : int [1:1005] 10 10 10 10 10 10 10 10 10 10 ...
## $ visibility : num [1:1005] NA NA NA NA NA NA NA NA NA NA ...
## $ sea_level_pressure: num [1:1005] 30.1 30.1 30.1 30.1 30.1 ...
## $ aqi : int [1:1005] 11 11 11 11 11 11 11 11 11 11 ...
## $ main_pollutant : chr [1:1005] "NO2" "NO2" "NO2" "NO2" ...
## $ co : num [1:1005] NA NA NA NA NA NA NA NA NA NA ...
## $ ozone : num [1:1005] 10 10 10 10 10 10 10 10 10 10 ...
## $ pm10 : num [1:1005] NA NA NA NA NA NA NA NA NA NA ...
## $ pm25 : num [1:1005] NA NA NA NA NA NA NA NA NA NA ...
## $ no2 : num [1:1005] 11 11 11 11 11 11 11 11 11 11 ...
We can see that all the datasets were merged into one final_data
successfully. We now have a new column n that counts the
amount of runners in each group for future computations and potential
weighing if needed.
# Select the continuous variables
# NOTE(review): summarise_all() is superseded by across(); it is kept here
# because it orders the output columns by statistic (all *_mean first, then
# all *_median, ...), which across() would interleave per variable.
continuous_summary <- final_data %>%
select(avg_chip_seconds, high_temp, low_temp, avg_temp, precipitation,
dew_point, wind_speed, visibility, sea_level_pressure, aqi, pm10, pm25, no2, ozone, co) %>%
summarise_all(list(
mean = ~mean(., na.rm = TRUE),
median = ~median(., na.rm = TRUE),
sd = ~sd(., na.rm = TRUE),
min = ~min(., na.rm = TRUE),
max = ~max(., na.rm = TRUE)
))
continuous_summary
## # A tibble: 1 × 75
## avg_chip_seconds_mean high_temp_mean low_temp_mean avg_temp_mean
## <dbl> <dbl> <dbl> <dbl>
## 1 14081. 61.6 46.0 53.7
## # ℹ 71 more variables: precipitation_mean <dbl>, dew_point_mean <dbl>,
## # wind_speed_mean <dbl>, visibility_mean <dbl>,
## # sea_level_pressure_mean <dbl>, aqi_mean <dbl>, pm10_mean <dbl>,
## # pm25_mean <dbl>, no2_mean <dbl>, ozone_mean <dbl>, co_mean <dbl>,
## # avg_chip_seconds_median <dbl>, high_temp_median <int>,
## # low_temp_median <int>, avg_temp_median <dbl>, precipitation_median <dbl>,
## # dew_point_median <dbl>, wind_speed_median <int>, visibility_median <dbl>, …
# Grouped summary: mean and sd of every continuous variable by gender and
# performance subgroup. across() replaces the 30 hand-written mean/sd
# pairs of the original; its default .names = "{.col}_{.fn}" produces the
# exact same output column names and order (var_mean, var_sd, ...), and
# summarise() still emits the same grouped-output message.
continuous_summary_grouped <- final_data %>%
  group_by(gender, subgroup) %>% # group by gender and subgroup
  summarise(
    across(
      c(avg_chip_seconds, high_temp, low_temp, avg_temp, precipitation,
        dew_point, wind_speed, visibility, sea_level_pressure,
        aqi, pm10, pm25, no2, ozone, co),
      list(mean = ~ mean(.x, na.rm = TRUE), sd = ~ sd(.x, na.rm = TRUE))
    )
  ) %>%
  arrange(gender, subgroup)
## `summarise()` has grouped output by 'gender'. You can override using the
## `.groups` argument.
continuous_summary_grouped
## # A tibble: 10 × 32
## # Groups: gender [2]
## gender subgroup avg_chip_seconds_mean avg_chip_seconds_sd high_temp_mean
## <chr> <fct> <dbl> <dbl> <dbl>
## 1 female elite 10426. 334. 61.6
## 2 female competitive 12612. 319. 61.6
## 3 female average 14479. 386. 61.6
## 4 female recreational 16718. 449. 61.6
## 5 female slow 20177. 810. 61.6
## 6 male elite 9350. 223. 61.6
## 7 male competitive 11101. 225. 61.6
## 8 male average 12870. 263. 61.6
## 9 male recreational 14834. 317. 61.6
## 10 male slow 18278. 615. 61.6
## # ℹ 27 more variables: high_temp_sd <dbl>, low_temp_mean <dbl>,
## # low_temp_sd <dbl>, avg_temp_mean <dbl>, avg_temp_sd <dbl>,
## # precipitation_mean <dbl>, precipitation_sd <dbl>, dew_point_mean <dbl>,
## # dew_point_sd <dbl>, wind_speed_mean <dbl>, wind_speed_sd <dbl>,
## # visibility_mean <dbl>, visibility_sd <dbl>, sea_level_pressure_mean <dbl>,
## # sea_level_pressure_sd <dbl>, aqi_mean <dbl>, aqi_sd <dbl>, pm10_mean <dbl>,
## # pm10_sd <dbl>, pm25_mean <dbl>, pm25_sd <dbl>, no2_mean <dbl>, …
Select the continuous variables to make a clean table: (KB)
# Keep the outcome (avg_chip_seconds) plus all continuous weather and
# air-quality predictors, in the same order as the original selection.
continuous_vars <- final_data %>%
  select(
    avg_chip_seconds,
    high_temp, low_temp, avg_temp, dew_point,
    wind_speed, visibility, sea_level_pressure,
    aqi, co, ozone, pm10, pm25, no2
  )
str(continuous_vars)
## tibble [1,005 × 14] (S3: tbl_df/tbl/data.frame)
## $ avg_chip_seconds : num [1:1005] 10641 12733 14707 16821 19054 ...
## $ high_temp : int [1:1005] 57 57 57 57 57 57 57 57 57 57 ...
## $ low_temp : int [1:1005] 44 44 44 44 44 44 44 44 44 44 ...
## $ avg_temp : num [1:1005] 52.3 52.3 52.3 52.3 52.3 ...
## $ dew_point : num [1:1005] 48.6 48.6 48.6 48.6 48.6 ...
## $ wind_speed : int [1:1005] 10 10 10 10 10 10 10 10 10 10 ...
## $ visibility : num [1:1005] NA NA NA NA NA NA NA NA NA NA ...
## $ sea_level_pressure: num [1:1005] 30.1 30.1 30.1 30.1 30.1 ...
## $ aqi : int [1:1005] 11 11 11 11 11 11 11 11 11 11 ...
## $ co : num [1:1005] NA NA NA NA NA NA NA NA NA NA ...
## $ ozone : num [1:1005] 10 10 10 10 10 10 10 10 10 10 ...
## $ pm10 : num [1:1005] NA NA NA NA NA NA NA NA NA NA ...
## $ pm25 : num [1:1005] NA NA NA NA NA NA NA NA NA NA ...
## $ no2 : num [1:1005] 11 11 11 11 11 11 11 11 11 11 ...
We can see that there are a total of 14 continuous variables.
Find Statistics for only the continuous variables: (KB)
# All selected columns are already numeric; where(is.numeric) acts as a guard
continuous_only <- continuous_vars %>%
select(where(is.numeric))
# psych::describe() returns one row of statistics per variable; keep the
# main moments plus n (n per variable reveals missingness, e.g. pm10/pm25/co)
continuous_summary <- describe(continuous_only) %>%
as.data.frame() %>%
select(
mean,
sd,
median,
min,
max,
skew,
kurtosis,
n
)
continuous_summary
## mean sd median min max
## avg_chip_seconds 14080.542935 3355.8623052 13566.35 8912.405 21729.70
## high_temp 61.621891 10.4803135 61.00 44.000 88.00
## low_temp 46.009950 7.9703072 45.00 28.000 72.00
## avg_temp 53.718209 8.6975811 52.88 29.580 79.35
## dew_point 41.023831 10.7187978 42.40 21.130 65.22
## wind_speed 13.119403 6.7699588 13.00 3.000 39.00
## visibility 9.848072 1.4872173 10.00 6.060 20.00
## sea_level_pressure 29.925970 0.3391476 29.98 29.130 30.54
## aqi 50.572139 21.7623899 52.00 11.000 119.00
## co 13.948052 11.7791733 9.00 2.000 56.00
## ozone 36.179104 18.0530913 34.00 7.000 100.00
## pm10 23.576000 13.0305755 21.00 5.000 69.00
## pm25 57.510345 16.7892845 53.00 21.000 119.00
## no2 31.915423 14.9253725 32.00 7.000 66.00
## skew kurtosis n
## avg_chip_seconds 0.39027498 -0.8608475 1005
## high_temp 0.57757331 -0.3573111 1005
## low_temp 0.77231953 1.0186755 1005
## avg_temp 0.42260873 0.3116392 1005
## dew_point -0.08246867 -0.9095537 1005
## wind_speed 0.78517635 1.0872273 1005
## visibility 2.79083334 25.7005389 830
## sea_level_pressure -0.41101449 -0.7527869 1005
## aqi 0.32111228 0.2198760 1005
## co 1.58769285 2.1592651 770
## ozone 1.22285653 2.3430336 1005
## pm10 0.89184403 0.8083714 625
## pm25 0.85098957 1.3911927 725
## no2 0.15839726 -0.7783209 1005
Add continuous variables to a clean table: (KB)
| variable | mean | sd | median | min | max | skew | kurtosis | n |
|---|---|---|---|---|---|---|---|---|
| avg_chip_seconds | 14080.54 | 3355.86 | 13566.35 | 8912.41 | 21729.70 | 0.39 | -0.86 | 1005 |
| high_temp | 61.62 | 10.48 | 61.00 | 44.00 | 88.00 | 0.58 | -0.36 | 1005 |
| low_temp | 46.01 | 7.97 | 45.00 | 28.00 | 72.00 | 0.77 | 1.02 | 1005 |
| avg_temp | 53.72 | 8.70 | 52.88 | 29.58 | 79.35 | 0.42 | 0.31 | 1005 |
| dew_point | 41.02 | 10.72 | 42.40 | 21.13 | 65.22 | -0.08 | -0.91 | 1005 |
| wind_speed | 13.12 | 6.77 | 13.00 | 3.00 | 39.00 | 0.79 | 1.09 | 1005 |
| visibility | 9.85 | 1.49 | 10.00 | 6.06 | 20.00 | 2.79 | 25.70 | 830 |
| sea_level_pressure | 29.93 | 0.34 | 29.98 | 29.13 | 30.54 | -0.41 | -0.75 | 1005 |
| aqi | 50.57 | 21.76 | 52.00 | 11.00 | 119.00 | 0.32 | 0.22 | 1005 |
| co | 13.95 | 11.78 | 9.00 | 2.00 | 56.00 | 1.59 | 2.16 | 770 |
| ozone | 36.18 | 18.05 | 34.00 | 7.00 | 100.00 | 1.22 | 2.34 | 1005 |
| pm10 | 23.58 | 13.03 | 21.00 | 5.00 | 69.00 | 0.89 | 0.81 | 625 |
| pm25 | 57.51 | 16.79 | 53.00 | 21.00 | 119.00 | 0.85 | 1.39 | 725 |
| no2 | 31.92 | 14.93 | 32.00 | 7.00 | 66.00 | 0.16 | -0.78 | 1005 |
The table above summarizes key descriptive statistics for continuous variables in the dataset, including average marathon chip time (in seconds), temperature measures, humidity-related metrics (dew point), wind speed, visibility, air pressure, and air quality indicators (aqi, co, ozone, pm10, pm2.5, no2). For each variable, the table reports its mean, standard deviation, median, minimum, maximum, skewness, kurtosis, and sample size, giving an overview of central tendency, variability, distribution shape, and data completeness.
All the variables show reasonable ranges and central values for weather and air-quality conditions during the marathon events. We can see that chip times cluster a bit under 4 hours (about 14,100 seconds) with moderate variability. Temperature measures (high, low, and average) fall within typical seasonal ranges, while air-quality variables such as ozone, pm10, pm2.5, and co show right-skewed distributions, indicating occasional higher pollution days. Visibility also shows strong right skew and high kurtosis, suggesting a few unusually high values compared with the rest of the data. Overall, most variables are moderately skewed, with some (like visibility and co) showing heavier tails.
library(ggplot2) # load plotting library
# NOTE(review): ggplot2 was already attached at the top of the file;
# this repeat call is harmless but redundant
ggplot(final_data, aes(x = avg_chip_seconds / 3600)) + # convert from seconds to hours
geom_histogram(bins = 20, fill = "steelblue", color = "black") + # create a histogram
labs(title = "Distribution of Average Finishing Times", # generate labels
x = "Average Finishing Time (hours)",
y = "Frequency"
)
# Scatter of average temperature vs finishing time, one panel per race,
# with a linear trend line fitted within each panel
ggplot(final_data, aes(x = avg_temp, y = avg_chip_seconds / 3600, color = marathon)) +
geom_point(alpha = 0.6) +
geom_smooth(method = "lm", se = FALSE, color = "black") +
facet_wrap(~ marathon, scales = "free_x") + # each marathon has own x-axis scale
labs(title = "Overview of the Effect of Average Temperature on Finishing Time by Marathon",
x = "Average Temperature (°F)",
y = "Finishing Time (hours)"
)
## `geom_smooth()` using formula = 'y ~ x'
# Scatter of AQI vs finishing time by race; the legend is suppressed since
# the facet labels already identify each marathon
ggplot(final_data, aes(x = aqi, y = avg_chip_seconds / 3600, color = marathon)) +
geom_point(alpha = 0.6) +
geom_smooth(method = "lm", se = FALSE, color = "black") +
facet_wrap(~ marathon, scales = "free_x") +
labs(title = "Overview of the Relationship Between Air Quality and Finishing Time by Marathon",
x = "Air Quality Index (AQI)",
y = "Average Finishing Time (hours)"
) +
theme(legend.position = "none")
## `geom_smooth()` using formula = 'y ~ x'
# select only the numeric variables
numeric_vars <- final_data %>%
  select(avg_chip_seconds, avg_temp, precipitation, dew_point,
         wind_speed, visibility, sea_level_pressure,
         aqi, pm10, pm25, no2, ozone, co)
# Create correlation matrix using pairwise complete observations: each
# correlation uses all rows where BOTH variables are present. The previous
# use = "complete.obs" dropped every row containing any NA (pm10, pm25, co
# and visibility each have many), which discarded most of the data, left
# visibility with zero variance in the surviving rows (all-NA row), and
# triggered a "standard deviation is zero" warning.
cor_matrix <- cor(numeric_vars, use = "pairwise.complete.obs")
# round the correlations
round(cor_matrix, 2)
## avg_chip_seconds avg_temp precipitation dew_point wind_speed
## avg_chip_seconds 1.00 0.01 0.01 0.02 0.01
## avg_temp 0.01 1.00 0.12 0.86 -0.15
## precipitation 0.01 0.12 1.00 0.27 0.40
## dew_point 0.02 0.86 0.27 1.00 -0.05
## wind_speed 0.01 -0.15 0.40 -0.05 1.00
## visibility NA NA NA NA NA
## sea_level_pressure 0.03 -0.41 -0.15 -0.35 0.19
## aqi 0.01 0.30 -0.25 0.35 -0.47
## pm10 -0.01 0.27 -0.33 0.21 -0.41
## pm25 0.01 0.29 -0.35 0.31 -0.53
## no2 0.01 0.21 -0.18 0.18 -0.50
## ozone 0.03 0.61 -0.01 0.52 0.03
## co 0.00 0.02 -0.25 0.05 -0.45
## visibility sea_level_pressure aqi pm10 pm25 no2 ozone
## avg_chip_seconds NA 0.03 0.01 -0.01 0.01 0.01 0.03
## avg_temp NA -0.41 0.30 0.27 0.29 0.21 0.61
## precipitation NA -0.15 -0.25 -0.33 -0.35 -0.18 -0.01
## dew_point NA -0.35 0.35 0.21 0.31 0.18 0.52
## wind_speed NA 0.19 -0.47 -0.41 -0.53 -0.50 0.03
## visibility 1 NA NA NA NA NA NA
## sea_level_pressure NA 1.00 -0.11 -0.41 -0.12 -0.10 -0.07
## aqi NA -0.11 1.00 0.75 0.98 0.74 0.46
## pm10 NA -0.41 0.75 1.00 0.75 0.62 0.32
## pm25 NA -0.12 0.98 0.75 1.00 0.71 0.39
## no2 NA -0.10 0.74 0.62 0.71 1.00 0.48
## ozone NA -0.07 0.46 0.32 0.39 0.48 1.00
## co NA -0.10 0.58 0.47 0.59 0.65 0.10
## co
## avg_chip_seconds 0.00
## avg_temp 0.02
## precipitation -0.25
## dew_point 0.05
## wind_speed -0.45
## visibility NA
## sea_level_pressure -0.10
## aqi 0.58
## pm10 0.47
## pm25 0.59
## no2 0.65
## ozone 0.10
## co 1.00
#Correlation Heat map
# Diverging palette (darkblue = negative, white = zero, red = positive) with
# the coefficient values printed in black and axis labels rotated 45 degrees.
# Cells involving `visibility` are NA because its standard deviation is zero
# under complete.obs (see the warning from cor() above).
corrplot(
cor_matrix,
method="color",col=colorRampPalette(c("darkblue", "white", "red"))(200),
addCoef.col="black", tl.col="black", tl.srt=45
)
Here we focus on the pollutants of primary concern: CO, PM10, PM2.5, and the overall AQI.
# Convert finishing time to hours, then stack the four pollutant columns
# (co, pm25, pm10, aqi) into a single value column keyed by pollutant name,
# so all pollutants can be drawn with one faceted ggplot call.
pollutant_long <- final_data %>%
  mutate(finish_hours = avg_chip_seconds / 3600) %>%
  pivot_longer(c(co, pm25, pm10, aqi),
               names_to = "pollutant",
               values_to = "value")
# Scatter of each pollutant level against finishing time (hours), faceted by
# performance subgroup, with one loess trend line per pollutant.
p_pollutants <- ggplot(pollutant_long,
                       aes(x = value, y = finish_hours, color = pollutant))
p_pollutants <- p_pollutants +
  geom_point(alpha = 0.25) +
  geom_smooth(se = FALSE, method = "loess") +
  facet_wrap(~ subgroup, scales = "free") +
  scale_color_brewer(palette = "Dark2") +
  labs(title = "Pollution vs. Finish Times by Performance Subgroup",
       x = "Pollutant Level (AQI)",
       y = "Average Finishing Time (hours)",
       color = "Pollutant") +
  theme_minimal(base_size = 14)
print(p_pollutants)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 895 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 895 rows containing missing values or values outside the scale range
## (`geom_point()`).
We can see that most of the subgroups follow similar patterns with
air quality. As the air quality index increases, average finishing times
increase for most subgroups, with some leveling off beyond a certain
pollutant level among the faster subgroups — for example, PM2.5 for the
elite and competitive subgroups. We can also see that CO behaves oddly,
specifically for slow runners, where finishing times appear to get
faster at higher pollutant levels. However, this is most likely an
artifact of missing CO data for Berlin.
# Stack every continuous variable into (variable, value) pairs so a single
# faceted histogram call can show all of the distributions at once.
hist_cols <- c(
  "avg_chip_seconds", "high_temp", "low_temp", "avg_temp",
  "dew_point", "wind_speed", "visibility", "sea_level_pressure",
  "aqi", "co", "ozone", "pm10", "pm25", "no2"
)
continuous_vars <- final_data %>%
  select(all_of(hist_cols)) %>%
  pivot_longer(everything(), names_to = "variable", values_to = "value")
# One histogram panel per variable, each with its own axis scale.
ggplot(continuous_vars, aes(x = value)) +
  geom_histogram(bins = 30, fill = "#4A90E2", color = "white") +
  facet_wrap(~ variable, scales = "free") +
  theme_minimal() +
  labs(title = "Histograms of Continuous Variables",
       x = "Value",
       y = "Count")
## Warning: Removed 1070 rows containing non-finite outside the scale range
## (`stat_bin()`).
Looking at histograms for all the continuous variables, we can see the distribution shapes. Several of the environmental variables did display expected right skews (such as PM2.5 and wind speed). There is no transformation currently planned for our modeling goals. We can also see that finishing times were right skewed, which is typical for marathon data.
We want to see how subgroups behave over the years.
# Scatter plots by marathon and subgroup
# Finishing time (seconds) vs. year, one panel per marathon; points and loess
# trend lines are both colored by performance subgroup.
ggplot(final_data, aes(x = year, y = avg_chip_seconds)) +
geom_point(aes(color = subgroup), alpha = 0.7, size = 2) + # points colored by subgroup
geom_smooth(aes(color = subgroup), method = "loess", se = FALSE) + # optional trend lines
facet_wrap(~ marathon) + # one plot per marathon
labs(
title = "Average Marathon Finishing Times by Year and Subgroup",
x = "Year",
y = "Average Finishing Time (seconds)",
color = "Performance Subgroup"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "bottom"
)
## `geom_smooth()` using formula = 'y ~ x'
We can see that over time, most finishing times become faster. We can also see a potential shift toward faster finishing times after 2018, when supershoes were introduced. This is most visible in the Chicago marathon, which has data through 2023. The trend may also be starting in Berlin, but it is hard to tell since that data is only available through 2019. NYC finishing times appear fairly constant over the years. Boston, on the other hand, shows increasing finishing times, but the effect of supershoes is likewise hard to interpret because its data only extends through 2019, just one year after supershoes were widely introduced.
# Build final_data2 for Figures 5 and 6: the original rows (subgroup names
# lower-cased) plus a duplicated copy of every row relabelled "overall", so
# the faceted figures get an all-runners panel alongside the subgroups.
overall_rows <- final_data %>% mutate(subgroup = "overall")
final_data2 <- final_data %>%
  mutate(subgroup = tolower(as.character(subgroup))) %>%
  bind_rows(overall_rows) %>%
  mutate(subgroup = factor(
    subgroup,
    levels = c("elite", "competitive", "average", "recreational", "slow", "overall")
  ))
# Ensure year is numeric for a continuous x-axis.
final_data2$year <- as.numeric(as.character(final_data2$year))
#Figure 5: Visibility and Finishing time
# Finishing time is converted from seconds to hours on the fly; points are
# colored by marathon and a black linear trend is drawn per gender
# (distinguished by linetype), one panel per subgroup (incl. "overall").
ggplot(final_data2, aes(x = visibility, y = avg_chip_seconds / 3600, color = marathon)) +
geom_point(alpha = 0.6) +
geom_smooth(aes(linetype=gender),method = "lm", se = FALSE, color = "black") +
facet_wrap(~ subgroup, scales = "free_x") +
labs(title = "Overview of the Relationship Between Visibility and Finishing Time by Performance Subgroup",
x = "Visibility (mi)",
y = "Average Finishing Time (hours)"
) +
theme(legend.position = "right")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 350 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 350 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Average Finishing Time by Year, Subgroup, Gender (Supershoes)
# Fix: the y-axis label says "hours" but avg_chip_seconds is in seconds;
# convert to hours (as in Figure 5) so the label matches the data.
# The dashed red line marks 2018, when supershoes were introduced.
ggplot(final_data2, aes(x = year, y = avg_chip_seconds / 3600, color = marathon)) +
  geom_point(alpha = 0.6) +
  # One black linear trend per gender, distinguished by linetype.
  geom_smooth(aes(linetype = gender), method = "lm", se = FALSE, color = "black") +
  facet_wrap(~ subgroup, scales = "free_x") +
  geom_vline(xintercept = 2018, linetype = "dashed", color = "red", linewidth = 1) + # vertical line
  labs(title = "Average Finishing Time Over the Years",
       x = "Year",
       y = "Average Finishing Time (hours)"
  ) +
  theme(legend.position = "right")
## `geom_smooth()` using formula = 'y ~ x'
#Boston
# Per-year, per-gender finishing-time summary for Boston, rendered as a gt
# table with one row per year and a three-line cell per gender:
# N, mean (sd), and (min, max) of avg_chip_seconds.
boston <- final_data %>% filter(marathon == "Boston")
#Summarize
summary_long <- boston %>%
  group_by(year, gender) %>%
  summarise(
    N = n(),
    mean_time = mean(avg_chip_seconds, na.rm = TRUE),
    sd_time = sd(avg_chip_seconds, na.rm = TRUE),
    min_time = min(avg_chip_seconds, na.rm = TRUE),
    max_time = max(avg_chip_seconds, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(
    mean_sd = sprintf("%.1f (%.1f)", mean_time, sd_time),
    min_max = sprintf("(%.1f, %.1f)", min_time, max_time)
  )
#Pivot wider — one row per year with female_* / male_* columns
summary_wide <- summary_long %>%
  pivot_wider(
    id_cols = year,
    names_from = gender,
    values_from = c(N, mean_sd, min_max),
    names_glue = "{gender}_{.value}"
  ) %>%
  arrange(year)
#Combine the per-gender columns into one markdown cell per gender
summary_combined <- summary_wide %>%
  mutate(
    Female = paste0("N = ", female_N, "<br>", female_mean_sd, "<br>", female_min_max),
    Male = paste0("N = ", male_N, "<br>", male_mean_sd, "<br>", male_min_max)
  ) %>%
  select(year, Female, Male)
#GT table (fmt_markdown interprets the <br> line breaks)
summary_combined %>%
  gt(rowname_col = "year") %>%
  cols_label(Female = "Female", Male = "Male") %>%
  fmt_markdown(columns = c(Female, Male)) %>%
  tab_options(data_row.padding = px(3), table.font.size = px(14))
| Female | Male | |
|---|---|---|
| 1996 | N = 5 15217.2 (3847.8) (10660.5, 20672.9) |
N = 5 13472.0 (3508.2) (9553.1, 18590.4) |
| 1997 | N = 5 13811.4 (3144.6) (9696.1, 17793.1) |
N = 5 13249.5 (3041.8) (9535.2, 17253.4) |
| 1998 | N = 5 14294.7 (3424.4) (10018.3, 18893.5) |
N = 5 13138.4 (3231.1) (9425.3, 17716.3) |
| 1999 | N = 5 14642.4 (3451.0) (10369.0, 19272.6) |
N = 5 13356.1 (3282.8) (9509.8, 17959.3) |
| 2000 | N = 5 14850.7 (3411.3) (10705.8, 19436.7) |
N = 5 13323.5 (3209.8) (9622.5, 17823.0) |
| 2001 | N = 5 14606.2 (3445.5) (10318.3, 19216.4) |
N = 5 13322.3 (3227.2) (9546.6, 17838.4) |
| 2002 | N = 5 14278.3 (3248.7) (10226.8, 18574.5) |
N = 5 13204.1 (3157.6) (9460.1, 17535.6) |
| 2003 | N = 5 14845.7 (3465.3) (10650.3, 19605.2) |
N = 5 13424.3 (3247.2) (9650.1, 18005.1) |
| 2004 | N = 5 14792.1 (3541.7) (10309.1, 19535.2) |
N = 5 13520.7 (3352.3) (9555.5, 18208.5) |
| 2005 | N = 5 14791.0 (3356.0) (10617.7, 19230.4) |
N = 5 13534.0 (3258.8) (9690.4, 18055.3) |
| 2006 | N = 5 14657.8 (3404.6) (10478.7, 19257.0) |
N = 5 13118.3 (3196.2) (9420.5, 17656.8) |
| 2007 | N = 5 15277.1 (3581.3) (11047.9, 20232.7) |
N = 5 13818.4 (3411.5) (9879.1, 18629.0) |
| 2008 | N = 5 14905.5 (3483.9) (10797.0, 19738.4) |
N = 5 13202.1 (3270.6) (9430.7, 17871.7) |
| 2009 | N = 5 15485.4 (3573.9) (11283.2, 20372.2) |
N = 5 13271.9 (3289.2) (9487.4, 17965.4) |
| 2010 | N = 5 14939.6 (3513.2) (10767.8, 19803.8) |
N = 5 13024.1 (3265.3) (9255.5, 17713.6) |
| 2011 | N = 5 14621.4 (3509.4) (10412.0, 19502.2) |
N = 5 12748.4 (3222.8) (9014.7, 17382.0) |
| 2012 | N = 5 15681.1 (3883.5) (11132.5, 21181.7) |
N = 5 13866.2 (3643.5) (9764.6, 19188.8) |
| 2014 | N = 5 14440.7 (3713.6) (10148.2, 19797.4) |
N = 5 13430.5 (3582.6) (9529.5, 18729.3) |
| 2015 | N = 5 14863.0 (3531.3) (10770.7, 19804.4) |
N = 5 13366.7 (3388.6) (9615.3, 18264.4) |
| 2016 | N = 5 15277.7 (3580.5) (11097.3, 20215.0) |
N = 5 13709.5 (3375.2) (9905.1, 18518.7) |
| 2017 | N = 5 14592.5 (3513.9) (10421.3, 19499.9) |
N = 5 13425.3 (3374.8) (9610.4, 18277.0) |
| 2018 | N = 5 16339.2 (3926.8) (11859.0, 21729.7) |
N = 5 14084.6 (3607.0) (10131.2, 19323.7) |
| 2019 | N = 5 14752.6 (3570.0) (10589.1, 19750.8) |
N = 5 13266.6 (3414.7) (9468.8, 18201.8) |
#Berlin
# Per-year, per-gender finishing-time summary for Berlin, rendered with gt.
# Fix: Berlin 2019 has no female rows, so after pivot_wider the female_*
# columns are NA and the original cell rendered as "N = NA NA NA"; a missing
# gender-year combination now displays an em dash instead.
berlin <- final_data %>% filter(marathon == "Berlin")
#Summarize
summary_long <- berlin %>%
  group_by(year, gender) %>%
  summarise(
    N = n(),
    mean_time = mean(avg_chip_seconds, na.rm = TRUE),
    sd_time = sd(avg_chip_seconds, na.rm = TRUE),
    min_time = min(avg_chip_seconds, na.rm = TRUE),
    max_time = max(avg_chip_seconds, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(
    mean_sd = sprintf("%.1f (%.1f)", mean_time, sd_time),
    min_max = sprintf("(%.1f, %.1f)", min_time, max_time)
  )
#Pivot wider — ensure one row per year
summary_wide <- summary_long %>%
  pivot_wider(
    id_cols = year,
    names_from = gender,
    values_from = c(N, mean_sd, min_max),
    names_glue = "{gender}_{.value}"
  ) %>%
  arrange(year)
#Combine into one column per gender for GT; use a dash when a gender has no
# data for a year rather than pasting NA values into the cell.
fmt_gender_cell <- function(n, mean_sd, min_max) {
  ifelse(is.na(n), "—", paste0("N = ", n, "<br>", mean_sd, "<br>", min_max))
}
summary_combined <- summary_wide %>%
  mutate(
    Female = fmt_gender_cell(female_N, female_mean_sd, female_min_max),
    Male = fmt_gender_cell(male_N, male_mean_sd, male_min_max)
  ) %>%
  select(year, Female, Male)
#GT table
summary_combined %>%
  gt(rowname_col = "year") %>%
  cols_label(Female = "Female", Male = "Male") %>%
  fmt_markdown(columns = c(Female, Male)) %>%
  tab_options(data_row.padding = px(3), table.font.size = px(14))
| Female | Male | |
|---|---|---|
| 1996 | N = 5 14791.1 (3307.5) (10640.5, 19053.5) |
N = 5 13166.0 (3090.2) (9468.1, 17374.3) |
| 1997 | N = 5 14553.9 (3399.7) (10226.1, 18986.8) |
N = 5 13065.5 (3136.0) (9267.9, 17323.2) |
| 1998 | N = 5 14831.3 (3489.7) (10463.8, 19463.4) |
N = 5 12991.0 (3130.8) (9265.4, 17315.3) |
| 1999 | N = 5 14321.4 (3434.2) (9902.6, 18792.8) |
N = 5 13016.2 (3138.6) (9235.9, 17305.9) |
| 2000 | N = 5 14889.6 (3643.1) (10211.3, 19680.0) |
N = 5 13201.5 (3234.6) (9380.3, 17682.0) |
| 2001 | N = 5 14331.0 (3388.5) (10059.9, 18813.7) |
N = 5 13228.3 (3182.6) (9430.4, 17587.7) |
| 2002 | N = 5 14532.6 (3398.5) (10375.5, 19074.9) |
N = 5 13056.9 (3163.9) (9316.1, 17422.1) |
| 2003 | N = 5 14979.7 (3619.7) (10463.7, 19812.7) |
N = 5 12932.0 (3195.9) (9187.7, 17400.3) |
| 2004 | N = 5 14375.5 (3469.3) (10077.3, 19041.5) |
N = 5 13083.3 (3211.0) (9311.8, 17572.8) |
| 2005 | N = 5 14371.0 (3492.3) (10106.9, 19105.7) |
N = 5 13235.4 (3279.2) (9418.8, 17848.1) |
| 2006 | N = 5 14606.4 (3572.6) (10217.6, 19437.7) |
N = 5 13110.9 (3293.4) (9308.0, 17790.1) |
| 2007 | N = 5 14725.4 (3590.2) (10298.0, 19600.3) |
N = 5 12895.0 (3236.4) (9088.5, 17454.1) |
| 2008 | N = 5 14377.1 (3491.0) (10144.3, 19126.5) |
N = 5 12874.5 (3215.0) (9157.7, 17434.6) |
| 2009 | N = 5 14877.3 (3618.0) (10459.1, 19778.9) |
N = 5 13098.7 (3295.5) (9268.8, 17755.2) |
| 2010 | N = 5 14811.0 (3572.6) (10492.0, 19717.1) |
N = 5 12959.0 (3212.2) (9200.4, 17478.9) |
| 2011 | N = 5 14411.8 (3614.6) (9935.7, 19335.9) |
N = 5 12866.1 (3287.5) (9043.1, 17550.3) |
| 2012 | N = 5 14516.7 (3516.1) (10271.8, 19355.7) |
N = 5 12914.7 (3240.3) (9167.5, 17530.7) |
| 2013 | N = 5 14564.7 (3536.2) (10271.5, 19393.9) |
N = 5 12824.1 (3216.3) (9085.9, 17404.6) |
| 2014 | N = 5 14494.4 (3596.9) (10134.7, 19452.1) |
N = 5 12816.1 (3294.9) (9019.4, 17551.3) |
| 2015 | N = 5 14408.1 (3612.2) (9990.7, 19376.8) |
N = 5 12894.2 (3316.4) (9063.2, 17642.5) |
| 2016 | N = 5 14627.9 (3614.5) (10395.8, 19712.4) |
N = 5 12874.3 (3350.5) (9098.7, 17745.8) |
| 2017 | N = 5 14559.3 (3628.9) (10254.6, 19637.8) |
N = 5 12897.2 (3342.5) (9098.2, 17727.7) |
| 2018 | N = 5 14423.9 (3696.1) (10126.9, 19687.9) |
N = 5 12795.0 (3428.9) (9005.3, 17861.4) |
| 2019 | N = NA NA NA |
N = 5 12736.8 (3393.8) (8912.4, 17696.4) |
#Chicago
# Per-year, per-gender finishing-time summary for Chicago, rendered as a gt
# table: one row per year, a three-line cell per gender (N, mean (sd),
# (min, max)).
chicago <- final_data %>% filter(marathon == "Chicago")
#Summarize
summary_long <- chicago %>%
  group_by(year, gender) %>%
  summarise(
    N = n(),
    mean_time = mean(avg_chip_seconds, na.rm = TRUE),
    sd_time = sd(avg_chip_seconds, na.rm = TRUE),
    min_time = min(avg_chip_seconds, na.rm = TRUE),
    max_time = max(avg_chip_seconds, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(
    mean_sd = sprintf("%.1f (%.1f)", mean_time, sd_time),
    min_max = sprintf("(%.1f, %.1f)", min_time, max_time)
  )
#Pivot wider — one row per year with female_* / male_* columns
summary_wide <- summary_long %>%
  pivot_wider(
    id_cols = year,
    names_from = gender,
    values_from = c(N, mean_sd, min_max),
    names_glue = "{gender}_{.value}"
  ) %>%
  arrange(year)
#Combine the per-gender columns into one markdown cell per gender
summary_combined <- summary_wide %>%
  mutate(
    Female = paste0("N = ", female_N, "<br>", female_mean_sd, "<br>", female_min_max),
    Male = paste0("N = ", male_N, "<br>", male_mean_sd, "<br>", male_min_max)
  ) %>%
  select(year, Female, Male)
#GT table (fmt_markdown interprets the <br> line breaks)
summary_combined %>%
  gt(rowname_col = "year") %>%
  cols_label(Female = "Female", Male = "Male") %>%
  fmt_markdown(columns = c(Female, Male)) %>%
  tab_options(data_row.padding = px(3), table.font.size = px(14))
| Female | Male | |
|---|---|---|
| 1996 | N = 5 15361.5 (3705.3) (10726.5, 20328.6) |
N = 5 13245.6 (3285.9) (9278.5, 17751.6) |
| 1997 | N = 5 15402.0 (3905.1) (10755.0, 20900.2) |
N = 5 13136.4 (3316.1) (9223.7, 17784.4) |
| 1998 | N = 5 14955.0 (3916.5) (10262.3, 20512.0) |
N = 5 13145.7 (3346.5) (9170.4, 17837.8) |
| 1999 | N = 5 15156.3 (3957.9) (10487.8, 20808.1) |
N = 5 13032.8 (3371.6) (9060.3, 17803.4) |
| 2000 | N = 5 14738.0 (3861.6) (10118.7, 20221.9) |
N = 5 13215.7 (3398.3) (9282.5, 18065.0) |
| 2001 | N = 5 14429.7 (3609.3) (10126.2, 19479.9) |
N = 5 13369.8 (3414.3) (9390.6, 18204.2) |
| 2002 | N = 5 14309.0 (3643.1) (10020.3, 19444.2) |
N = 5 13102.1 (3386.7) (9152.4, 17932.0) |
| 2003 | N = 5 14887.9 (3761.3) (10417.4, 20176.3) |
N = 5 13114.7 (3450.9) (9105.6, 18062.3) |
| 2004 | N = 5 14962.7 (3798.4) (10470.1, 20327.4) |
N = 5 13194.1 (3443.8) (9233.0, 18162.9) |
| 2005 | N = 5 14738.4 (3752.9) (10317.7, 20069.0) |
N = 5 13259.0 (3469.1) (9258.5, 18255.3) |
| 2006 | N = 5 14674.6 (3791.5) (10227.3, 20072.8) |
N = 5 13287.1 (3526.6) (9192.8, 18361.5) |
| 2007 | N = 5 15826.7 (3805.3) (11151.2, 20892.5) |
N = 5 13781.5 (3740.1) (9512.8, 19212.9) |
| 2008 | N = 5 15364.6 (3974.0) (10723.0, 21019.8) |
N = 5 13354.7 (3681.0) (9263.0, 18810.3) |
| 2009 | N = 5 15237.2 (3846.4) (10746.7, 20699.7) |
N = 5 13170.2 (3559.7) (9131.4, 18399.0) |
| 2010 | N = 5 15025.6 (3999.9) (10351.0, 20757.4) |
N = 5 13326.4 (3726.8) (9172.0, 18849.7) |
| 2011 | N = 5 14893.7 (3892.2) (10418.2, 20494.5) |
N = 5 13242.8 (3676.8) (9144.8, 18693.9) |
| 2012 | N = 5 14889.6 (3869.1) (10466.0, 20485.7) |
N = 5 13107.6 (3605.4) (9058.8, 18442.6) |
| 2013 | N = 5 14651.1 (3781.9) (10283.6, 20091.7) |
N = 5 13029.2 (3628.6) (8968.0, 18416.1) |
| 2014 | N = 5 15181.3 (3965.4) (10606.0, 20880.7) |
N = 5 13086.6 (3667.2) (8983.7, 18538.5) |
| 2015 | N = 5 14959.6 (3870.6) (10450.0, 20483.7) |
N = 5 13551.9 (3689.4) (9412.8, 18970.4) |
| 2016 | N = 5 14863.8 (3954.7) (10374.3, 20636.8) |
N = 5 13777.1 (3755.6) (9603.8, 19319.1) |
| 2017 | N = 5 14681.0 (4077.0) (10128.5, 20754.5) |
N = 5 13652.2 (3877.8) (9408.2, 19476.2) |
| 2018 | N = 5 14618.4 (3959.6) (10177.2, 20472.6) |
N = 5 13209.3 (3707.2) (9148.4, 18770.1) |
| 2019 | N = 5 14224.4 (3931.2) (9884.5, 20104.5) |
N = 5 13250.6 (3743.2) (9166.6, 18877.8) |
| 2021 | N = 5 14950.0 (4046.9) (10335.9, 20824.5) |
N = 5 13344.5 (3791.7) (9215.3, 19044.9) |
| 2022 | N = 5 14204.9 (4006.2) (9797.8, 20235.4) |
N = 5 13053.7 (3640.3) (9147.6, 18522.8) |
| 2023 | N = 5 14144.0 (3915.2) (9825.9, 19986.2) |
N = 5 12793.7 (3627.6) (8957.7, 18331.0) |
#NYC
# Per-year, per-gender finishing-time summary for NYC, rendered as a gt
# table: one row per year, a three-line cell per gender (N, mean (sd),
# (min, max)).
NYC <- final_data %>% filter(marathon == "NYC")
#Summarize
summary_long <- NYC %>%
  group_by(year, gender) %>%
  summarise(
    N = n(),
    mean_time = mean(avg_chip_seconds, na.rm = TRUE),
    sd_time = sd(avg_chip_seconds, na.rm = TRUE),
    min_time = min(avg_chip_seconds, na.rm = TRUE),
    max_time = max(avg_chip_seconds, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(
    mean_sd = sprintf("%.1f (%.1f)", mean_time, sd_time),
    min_max = sprintf("(%.1f, %.1f)", min_time, max_time)
  )
#Pivot wider — one row per year with female_* / male_* columns
summary_wide <- summary_long %>%
  pivot_wider(
    id_cols = year,
    names_from = gender,
    values_from = c(N, mean_sd, min_max),
    names_glue = "{gender}_{.value}"
  ) %>%
  arrange(year)
#Combine the per-gender columns into one markdown cell per gender
summary_combined <- summary_wide %>%
  mutate(
    Female = paste0("N = ", female_N, "<br>", female_mean_sd, "<br>", female_min_max),
    Male = paste0("N = ", male_N, "<br>", male_mean_sd, "<br>", male_min_max)
  ) %>%
  select(year, Female, Male)
#GT table (fmt_markdown interprets the <br> line breaks)
summary_combined %>%
  gt(rowname_col = "year") %>%
  cols_label(Female = "Female", Male = "Male") %>%
  fmt_markdown(columns = c(Female, Male)) %>%
  tab_options(data_row.padding = px(3), table.font.size = px(14))
| Female | Male | |
|---|---|---|
| 1996 | N = 5 15480.0 (4091.6) (10744.5, 21404.8) |
N = 5 13539.8 (3521.9) (9491.6, 18605.9) |
| 1997 | N = 5 15490.5 (4135.8) (10618.2, 21418.5) |
N = 5 13409.2 (3534.7) (9391.0, 18533.7) |
| 1998 | N = 5 15151.7 (4068.9) (10368.5, 20988.1) |
N = 5 13448.8 (3572.5) (9364.3, 18624.2) |
| 1999 | N = 5 15173.1 (4065.5) (10439.8, 21028.8) |
N = 5 13513.6 (3572.0) (9456.2, 18694.7) |
| 2000 | N = 5 15230.7 (4108.6) (10399.9, 21145.8) |
N = 5 13604.2 (3609.7) (9493.9, 18852.2) |
| 2001 | N = 5 15038.9 (4008.2) (10315.4, 20749.4) |
N = 5 13363.7 (3561.8) (9282.4, 18505.0) |
| 2002 | N = 5 15240.3 (4067.8) (10485.5, 21096.4) |
N = 5 13410.2 (3604.6) (9271.8, 18631.9) |
| 2003 | N = 5 14887.2 (4042.9) (10024.7, 20618.9) |
N = 5 13675.2 (3681.7) (9429.6, 19006.4) |
| 2004 | N = 5 14983.6 (4113.3) (10126.6, 20870.8) |
N = 5 13596.7 (3741.7) (9347.2, 19060.7) |
| 2005 | N = 5 15094.9 (4099.7) (10223.7, 20923.8) |
N = 5 13622.0 (3691.9) (9492.5, 19039.0) |
| 2006 | N = 5 15134.6 (3983.2) (10513.8, 20866.7) |
N = 5 13569.5 (3543.2) (9554.9, 18693.3) |
| 2007 | N = 5 14947.6 (3939.6) (10374.6, 20598.1) |
N = 5 13516.8 (3538.0) (9543.4, 18669.0) |
| 2008 | N = 5 15046.0 (4009.1) (10407.8, 20838.7) |
N = 5 13475.4 (3552.6) (9499.2, 18662.3) |
| 2009 | N = 5 15499.3 (3995.6) (10928.3, 21275.4) |
N = 5 13510.7 (3559.3) (9479.3, 18690.0) |
| 2010 | N = 5 15420.0 (4046.1) (10689.4, 21220.9) |
N = 5 13433.7 (3606.3) (9371.1, 18712.8) |
| 2011 | N = 5 14954.4 (3909.6) (10428.1, 20574.5) |
N = 5 13150.9 (3533.9) (9207.0, 18347.3) |
| 2013 | N = 5 15116.0 (3982.7) (10428.6, 20820.1) |
N = 5 13458.7 (3573.3) (9433.0, 18694.3) |
| 2014 | N = 5 15195.2 (3984.7) (10624.2, 20979.5) |
N = 5 13735.8 (3663.3) (9610.4, 19104.0) |
| 2015 | N = 5 15117.5 (3998.5) (10511.6, 20886.2) |
N = 5 13710.8 (3667.8) (9640.4, 19095.7) |
| 2016 | N = 5 15140.2 (3992.8) (10580.3, 20926.3) |
N = 5 13448.2 (3638.7) (9415.8, 18810.1) |
| 2017 | N = 5 15362.8 (4076.9) (10719.8, 21286.7) |
N = 5 13756.3 (3692.9) (9709.8, 19217.6) |
| 2018 | N = 5 15023.2 (4024.8) (10501.2, 20934.1) |
N = 5 13309.7 (3692.0) (9296.6, 18833.4) |
| 2019 | N = 5 15014.9 (4074.3) (10466.8, 21020.9) |
N = 5 13515.4 (3722.1) (9487.9, 19072.5) |
| 2021 | N = 5 15040.2 (4190.0) (10343.5, 21256.2) |
N = 5 13573.8 (3880.4) (9388.0, 19433.1) |
| 2022 | N = 5 15105.3 (4195.9) (10336.7, 21273.0) |
N = 5 13643.0 (3825.2) (9491.7, 19379.1) |
| 2023 | N = 5 15487.5 (4153.5) (10910.2, 21625.2) |
N = 5 13256.4 (3758.3) (9245.1, 18953.4) |
| 2024 | N = 5 15185.0 (4150.4) (10547.0, 21319.6) |
N = 5 13460.1 (3761.6) (9416.5, 19120.1) |
###############ALL to be merged manually
#Boston
# All-runners (genders pooled) summary per year for Boston, rendered as a gt
# table with one three-line cell per year: N, mean (sd), (min, max).
boston <- final_data %>% filter(marathon == "Boston")
summary_all <- boston %>%
  group_by(year) %>%
  summarise(
    N = n(),
    mean_time = mean(avg_chip_seconds, na.rm = TRUE),
    sd_time = sd(avg_chip_seconds, na.rm = TRUE),
    min_time = min(avg_chip_seconds, na.rm = TRUE),
    max_time = max(avg_chip_seconds, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(combined = paste0(
    "N = ", N, "<br>",
    sprintf("%.1f (%.1f)", mean_time, sd_time), "<br>",
    sprintf("(%.1f, %.1f)", min_time, max_time)
  )) %>%
  select(year, combined)
summary_all %>%
  gt(rowname_col = "year") %>%
  cols_label(combined = "All Runners") %>%
  fmt_markdown(columns = "combined") %>%
  tab_options(data_row.padding = px(3), table.font.size = px(14))
| All Runners | |
|---|---|
| 1996 | N = 10 14344.6 (3591.1) (9553.1, 20672.9) |
| 1997 | N = 10 13530.5 (2931.7) (9535.2, 17793.1) |
| 1998 | N = 10 13716.6 (3197.4) (9425.3, 18893.5) |
| 1999 | N = 10 13999.2 (3246.9) (9509.8, 19272.6) |
| 2000 | N = 10 14087.1 (3224.8) (9622.5, 19436.7) |
| 2001 | N = 10 13964.2 (3219.2) (9546.6, 19216.4) |
| 2002 | N = 10 13741.2 (3072.9) (9460.1, 18574.5) |
| 2003 | N = 10 14135.0 (3253.4) (9650.1, 19605.2) |
| 2004 | N = 10 14156.4 (3319.4) (9555.5, 19535.2) |
| 2005 | N = 10 14162.5 (3188.2) (9690.4, 19230.4) |
| 2006 | N = 10 13888.0 (3217.2) (9420.5, 19257.0) |
| 2007 | N = 10 14547.7 (3385.8) (9879.1, 20232.7) |
| 2008 | N = 10 14053.8 (3309.8) (9430.7, 19738.4) |
| 2009 | N = 10 14378.7 (3441.8) (9487.4, 20372.2) |
| 2010 | N = 10 13981.8 (3353.1) (9255.5, 19803.8) |
| 2011 | N = 10 13684.9 (3326.3) (9014.7, 19502.2) |
| 2012 | N = 10 14773.6 (3676.7) (9764.6, 21181.7) |
| 2014 | N = 10 13935.6 (3481.0) (9529.5, 19797.4) |
| 2015 | N = 10 14114.9 (3356.7) (9615.3, 19804.4) |
| 2016 | N = 10 14493.6 (3382.9) (9905.1, 20215.0) |
| 2017 | N = 10 14008.9 (3305.8) (9610.4, 19499.9) |
| 2018 | N = 10 15211.9 (3748.0) (10131.2, 21729.7) |
| 2019 | N = 10 14009.6 (3385.3) (9468.8, 19750.8) |
#Berlin
# All-runners (genders pooled) summary per year for Berlin, rendered as a gt
# table with one three-line cell per year: N, mean (sd), (min, max).
berlin <- final_data %>% filter(marathon == "Berlin")
summary_all <- berlin %>%
  group_by(year) %>%
  summarise(
    N = n(),
    mean_time = mean(avg_chip_seconds, na.rm = TRUE),
    sd_time = sd(avg_chip_seconds, na.rm = TRUE),
    min_time = min(avg_chip_seconds, na.rm = TRUE),
    max_time = max(avg_chip_seconds, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(combined = paste0(
    "N = ", N, "<br>",
    sprintf("%.1f (%.1f)", mean_time, sd_time), "<br>",
    sprintf("(%.1f, %.1f)", min_time, max_time)
  )) %>%
  select(year, combined)
summary_all %>%
  gt(rowname_col = "year") %>%
  cols_label(combined = "All Runners") %>%
  fmt_markdown(columns = "combined") %>%
  tab_options(data_row.padding = px(3), table.font.size = px(14))
| All Runners | |
|---|---|
| 1996 | N = 10 13978.5 (3136.9) (9468.1, 19053.5) |
| 1997 | N = 10 13809.7 (3181.7) (9267.9, 18986.8) |
| 1998 | N = 10 13911.1 (3272.6) (9265.4, 19463.4) |
| 1999 | N = 10 13668.8 (3176.9) (9235.9, 18792.8) |
| 2000 | N = 10 14045.6 (3367.5) (9380.3, 19680.0) |
| 2001 | N = 10 13779.6 (3153.2) (9430.4, 18813.7) |
| 2002 | N = 10 13794.7 (3191.7) (9316.1, 19074.9) |
| 2003 | N = 10 13955.9 (3395.2) (9187.7, 19812.7) |
| 2004 | N = 10 13729.4 (3224.2) (9311.8, 19041.5) |
| 2005 | N = 10 13803.2 (3249.3) (9418.8, 19105.7) |
| 2006 | N = 10 13858.7 (3333.8) (9308.0, 19437.7) |
| 2007 | N = 10 13810.2 (3363.7) (9088.5, 19600.3) |
| 2008 | N = 10 13625.8 (3261.5) (9157.7, 19126.5) |
| 2009 | N = 10 13988.0 (3394.6) (9268.8, 19778.9) |
| 2010 | N = 10 13885.0 (3348.3) (9200.4, 19717.1) |
| 2011 | N = 10 13638.9 (3357.7) (9043.1, 19335.9) |
| 2012 | N = 10 13715.7 (3297.6) (9167.5, 19355.7) |
| 2013 | N = 10 13694.4 (3316.1) (9085.9, 19393.9) |
| 2014 | N = 10 13655.3 (3370.1) (9019.4, 19452.1) |
| 2015 | N = 10 13651.2 (3365.1) (9063.2, 19376.8) |
| 2016 | N = 10 13751.1 (3413.2) (9098.7, 19712.4) |
| 2017 | N = 10 13728.2 (3403.8) (9098.2, 19637.8) |
| 2018 | N = 10 13609.4 (3469.0) (9005.3, 19687.9) |
| 2019 | N = 5 12736.8 (3393.8) (8912.4, 17696.4) |
#Chicago
# All-runners (genders pooled) summary per year for Chicago, rendered as a
# gt table with one three-line cell per year: N, mean (sd), (min, max).
chicago <- final_data %>% filter(marathon == "Chicago")
summary_all <- chicago %>%
  group_by(year) %>%
  summarise(
    N = n(),
    mean_time = mean(avg_chip_seconds, na.rm = TRUE),
    sd_time = sd(avg_chip_seconds, na.rm = TRUE),
    min_time = min(avg_chip_seconds, na.rm = TRUE),
    max_time = max(avg_chip_seconds, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(combined = paste0(
    "N = ", N, "<br>",
    sprintf("%.1f (%.1f)", mean_time, sd_time), "<br>",
    sprintf("(%.1f, %.1f)", min_time, max_time)
  )) %>%
  select(year, combined)
summary_all %>%
  gt(rowname_col = "year") %>%
  cols_label(combined = "All Runners") %>%
  fmt_markdown(columns = "combined") %>%
  tab_options(data_row.padding = px(3), table.font.size = px(14))
| All Runners | |
|---|---|
| 1996 | N = 10 14303.5 (3484.8) (9278.5, 20328.6) |
| 1997 | N = 10 14269.2 (3618.1) (9223.7, 20900.2) |
| 1998 | N = 10 14050.4 (3564.2) (9170.4, 20512.0) |
| 1999 | N = 10 14094.5 (3642.4) (9060.3, 20808.1) |
| 2000 | N = 10 13976.9 (3521.9) (9282.5, 20221.9) |
| 2001 | N = 10 13899.7 (3359.0) (9390.6, 19479.9) |
| 2002 | N = 10 13705.5 (3376.5) (9152.4, 19444.2) |
| 2003 | N = 10 14001.3 (3529.0) (9105.6, 20176.3) |
| 2004 | N = 10 14078.4 (3542.9) (9233.0, 20327.4) |
| 2005 | N = 10 13998.7 (3495.2) (9258.5, 20069.0) |
| 2006 | N = 10 13980.9 (3528.6) (9192.8, 20072.8) |
| 2007 | N = 10 14804.1 (3716.8) (9512.8, 20892.5) |
| 2008 | N = 10 14359.6 (3763.4) (9263.0, 21019.8) |
| 2009 | N = 10 14203.7 (3659.8) (9131.4, 20699.7) |
| 2010 | N = 10 14176.0 (3753.1) (9172.0, 20757.4) |
| 2011 | N = 10 14068.2 (3674.0) (9144.8, 20494.5) |
| 2012 | N = 10 13998.6 (3648.6) (9058.8, 20485.7) |
| 2013 | N = 10 13840.1 (3597.1) (8968.0, 20091.7) |
| 2014 | N = 10 14133.9 (3766.3) (8983.7, 20880.7) |
| 2015 | N = 10 14255.7 (3641.2) (9412.8, 20483.7) |
| 2016 | N = 10 14320.4 (3680.7) (9603.8, 20636.8) |
| 2017 | N = 10 14166.6 (3790.1) (9408.2, 20754.5) |
| 2018 | N = 10 13913.9 (3691.6) (9148.4, 20472.6) |
| 2019 | N = 10 13737.5 (3655.1) (9166.6, 20104.5) |
| 2021 | N = 10 14147.3 (3792.7) (9215.3, 20824.5) |
| 2022 | N = 10 13629.3 (3659.4) (9147.6, 20235.4) |
| 2023 | N = 10 13468.8 (3628.8) (8957.7, 19986.2) |
#NYC
# All-runners (genders pooled) summary per year for NYC, rendered as a gt
# table with one three-line cell per year: N, mean (sd), (min, max).
NYC <- final_data %>% filter(marathon == "NYC")
summary_all <- NYC %>%
  group_by(year) %>%
  summarise(
    N = n(),
    mean_time = mean(avg_chip_seconds, na.rm = TRUE),
    sd_time = sd(avg_chip_seconds, na.rm = TRUE),
    min_time = min(avg_chip_seconds, na.rm = TRUE),
    max_time = max(avg_chip_seconds, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(combined = paste0(
    "N = ", N, "<br>",
    sprintf("%.1f (%.1f)", mean_time, sd_time), "<br>",
    sprintf("(%.1f, %.1f)", min_time, max_time)
  )) %>%
  select(year, combined)
summary_all %>%
  gt(rowname_col = "year") %>%
  cols_label(combined = "All Runners") %>%
  fmt_markdown(columns = "combined") %>%
  tab_options(data_row.padding = px(3), table.font.size = px(14))
| All Runners | |
|---|---|
| 1996 | N = 10 14509.9 (3741.5) (9491.6, 21404.8) |
| 1997 | N = 10 14449.8 (3789.2) (9391.0, 21418.5) |
| 1998 | N = 10 14300.2 (3719.7) (9364.3, 20988.1) |
| 1999 | N = 10 14343.4 (3712.3) (9456.2, 21028.8) |
| 2000 | N = 10 14417.5 (3745.5) (9493.9, 21145.8) |
| 2001 | N = 10 14201.3 (3682.2) (9282.4, 20749.4) |
| 2002 | N = 10 14325.2 (3749.6) (9271.8, 21096.4) |
| 2003 | N = 10 14281.2 (3700.9) (9429.6, 20618.9) |
| 2004 | N = 10 14290.2 (3778.4) (9347.2, 20870.8) |
| 2005 | N = 10 14358.4 (3759.1) (9492.5, 20923.8) |
| 2006 | N = 10 14352.1 (3648.5) (9554.9, 20866.7) |
| 2007 | N = 10 14232.2 (3609.7) (9543.4, 20598.1) |
| 2008 | N = 10 14260.7 (3665.8) (9499.2, 20838.7) |
| 2009 | N = 10 14505.0 (3718.2) (9479.3, 21275.4) |
| 2010 | N = 10 14426.8 (3761.9) (9371.1, 21220.9) |
| 2011 | N = 10 14052.6 (3639.7) (9207.0, 20574.5) |
| 2013 | N = 10 14287.4 (3672.5) (9433.0, 20820.1) |
| 2014 | N = 10 14465.5 (3689.6) (9610.4, 20979.5) |
| 2015 | N = 10 14414.2 (3692.5) (9640.4, 20886.2) |
| 2016 | N = 10 14294.2 (3710.2) (9415.8, 20926.3) |
| 2017 | N = 10 14559.5 (3763.7) (9709.8, 21286.7) |
| 2018 | N = 10 14166.4 (3751.5) (9296.6, 20934.1) |
| 2019 | N = 10 14265.2 (3762.9) (9487.9, 21020.9) |
| 2021 | N = 10 14307.0 (3884.9) (9388.0, 21256.2) |
| 2022 | N = 10 14374.1 (3862.9) (9491.7, 21273.0) |
| 2023 | N = 10 14371.9 (3915.1) (9245.1, 21625.2) |
| 2024 | N = 10 14322.6 (3843.3) (9416.5, 21319.6) |
#BOX AND WHISKER
#BOSTON
# Chip-time distributions for Boston, by gender and pooled.
# Fix: the plotted value is avg_chip_seconds / 3600 (hours), so the y-axis
# label now states the unit explicitly.
ggplot(boston, aes(x = gender, y = avg_chip_seconds / 3600, fill = gender)) +
  geom_boxplot(outlier.alpha = 0.3) +
  scale_y_continuous(name = "Chip Time (hours)") +
  labs(title = "Boston Marathon Chip Times by Gender") +
  theme_minimal() +
  theme(legend.position = "none")
# Pooled box: the dummy x = "" collapses all rows into one box, and the x
# axis is hidden entirely.
ggplot(boston, aes(x = "", y = avg_chip_seconds / 3600)) +
  geom_boxplot(fill = "steelblue", outlier.alpha = 0.3) +
  scale_y_continuous(name = "Chip Time (hours)") +
  labs(title = "Boston Marathon Chip Times - All Runners") +
  theme_minimal() +
  theme(axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank())
#BERLIN
# Chip-time distributions for Berlin, by gender and pooled.
# Fix: the plotted value is avg_chip_seconds / 3600 (hours), so the y-axis
# label now states the unit explicitly.
ggplot(berlin, aes(x = gender, y = avg_chip_seconds / 3600, fill = gender)) +
  geom_boxplot(outlier.alpha = 0.3) +
  scale_y_continuous(name = "Chip Time (hours)") +
  labs(title = "Berlin Marathon Chip Times by Gender") +
  theme_minimal() +
  theme(legend.position = "none")
# Pooled box with the dummy x axis hidden.
ggplot(berlin, aes(x = "", y = avg_chip_seconds / 3600)) +
  geom_boxplot(fill = "steelblue", outlier.alpha = 0.3) +
  scale_y_continuous(name = "Chip Time (hours)") +
  labs(title = "Berlin Marathon Chip Times - All Runners") +
  theme_minimal() +
  theme(axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank())
#CHICAGO
# Chip-time distributions for Chicago, by gender and pooled.
# Fix: the plotted value is avg_chip_seconds / 3600 (hours), so the y-axis
# label now states the unit explicitly.
ggplot(chicago, aes(x = gender, y = avg_chip_seconds / 3600, fill = gender)) +
  geom_boxplot(outlier.alpha = 0.3) +
  scale_y_continuous(name = "Chip Time (hours)") +
  labs(title = "Chicago Marathon Chip Times by Gender") +
  theme_minimal() +
  theme(legend.position = "none")
# Pooled box with the dummy x axis hidden.
ggplot(chicago, aes(x = "", y = avg_chip_seconds / 3600)) +
  geom_boxplot(fill = "steelblue", outlier.alpha = 0.3) +
  scale_y_continuous(name = "Chip Time (hours)") +
  labs(title = "Chicago Marathon Chip Times - All Runners") +
  theme_minimal() +
  theme(axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank())
# NYC: chip-time distributions, by gender and pooled.
# Fix: the dataset loaded in the setup chunk is `nyc` (lower case, matching
# `boston` and `chicago` used above); `NYC` is undefined and this chunk
# would fail with "object 'NYC' not found".
ggplot(nyc, aes(x = gender, y = avg_chip_seconds / 3600, fill = gender)) +
  geom_boxplot(outlier.alpha = 0.3) +
  scale_y_continuous(name = "Chip Time") +
  labs(title = "NYC Marathon Chip Times by Gender") +
  theme_minimal() +
  theme(legend.position = "none")
# Pooled across all runners
ggplot(nyc, aes(x = "", y = avg_chip_seconds / 3600)) +
  geom_boxplot(fill = "steelblue", outlier.alpha = 0.3) +
  scale_y_continuous(name = "Chip Time") +
  labs(title = "NYC Marathon Chip Times - All Runners") +
  theme_minimal() +
  theme(axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank())
Look at the full dataset for missing values (ZD, KB)
# Count the NA values in every column of the merged dataset
missing_summary <- final_data %>%
  summarise(across(everything(), ~ sum(is.na(.x))))
# Restrict the summary to columns that actually contain missing data
missing_summary <- missing_summary %>%
  select(where(~ any(.x > 0)))
# Render the counts as an HTML table
knitr::kable(missing_summary, caption = "Missing Values per Variable")
| visibility | co | pm10 | pm25 |
|---|---|---|---|
| 175 | 235 | 380 | 280 |
We can see that visibility, co,
pm10, and pm25 all have a lot of missing
values. This is due to the Berlin dataset.
Split the data so Berlin is used as a second case study to show how the method performs with missing data (whether successful or not).
# Hold Berlin out as a separate case study; model on the remaining races
berlin_data <- final_data %>% filter(marathon == "Berlin")
main_data <- final_data %>% filter(marathon != "Berlin")
str(main_data)
## tibble [770 × 21] (S3: tbl_df/tbl/data.frame)
## $ n : int [1:770] 159 1057 3481 2393 1389 767 5095 8091 6697 6677 ...
## $ marathon : chr [1:770] "Boston" "Boston" "Boston" "Boston" ...
## $ year : int [1:770] 1996 1996 1996 1996 1996 1996 1996 1996 1996 1996 ...
## $ gender : chr [1:770] "female" "female" "female" "female" ...
## $ subgroup : Factor w/ 5 levels "elite","competitive",..: 1 2 3 4 5 1 2 3 4 5 ...
## $ avg_chip_seconds : num [1:770] 10661 12931 14792 17030 20673 ...
## $ high_temp : int [1:770] 45 45 45 45 45 45 45 45 45 45 ...
## $ low_temp : int [1:770] 36 36 36 36 36 36 36 36 36 36 ...
## $ avg_temp : num [1:770] 40.5 40.5 40.5 40.5 40.5 ...
## $ precipitation : num [1:770] 0 0 0 0 0 0 0 0 0 0 ...
## $ dew_point : num [1:770] 24.2 24.2 24.2 24.2 24.2 ...
## $ wind_speed : int [1:770] 20 20 20 20 20 20 20 20 20 20 ...
## $ visibility : num [1:770] 10 10 10 10 10 10 10 10 10 10 ...
## $ sea_level_pressure: num [1:770] 30.2 30.2 30.2 30.2 30.2 ...
## $ aqi : int [1:770] 69 69 69 69 69 69 69 69 69 69 ...
## $ main_pollutant : chr [1:770] "PM10" "PM10" "PM10" "PM10" ...
## $ co : num [1:770] 13 13 13 13 13 13 13 13 13 13 ...
## $ ozone : num [1:770] 41 41 41 41 41 41 41 41 41 41 ...
## $ pm10 : num [1:770] 69 69 69 69 69 69 69 69 69 69 ...
## $ pm25 : num [1:770] NA NA NA NA NA NA NA NA NA NA ...
## $ no2 : num [1:770] 34 34 34 34 34 34 34 34 34 34 ...
We can see that main_data without Berlin now has 770 obs. and 21 variables.
# Recount NA values per column, now that Berlin has been removed
missing_summary <- main_data %>%
  summarise(across(everything(), ~ sum(is.na(.x))))
# Keep only the columns that still have at least one missing value
missing_summary <- missing_summary %>%
  select(where(~ any(.x > 0)))
# Render as an HTML table
knitr::kable(missing_summary, caption = "Missing Values per Variable")
| pm10 | pm25 |
|---|---|
| 320 | 70 |
In the main_data, we can see that there are still 320 missing values for pm10 and 70 missing values for pm2.5.
Since PM10 has a lot of missing values still, we will drop that column completely.
# pm10 is still mostly missing even without Berlin, so drop it outright
main_data <- main_data %>%
  select(!pm10)
Since PM2.5 is often the main pollutant, we decided to use KNN imputation to fill missing PM2.5 values because it predicts missing data using similar rows without assuming a specific parametric relationship, preserves variance, and works well for our relatively small dataset while using correlations with other environmental variables.
# Impute missing PM2.5 with k-nearest-neighbours (VIM::kNN, k = 5).
# imp_var = FALSE stops kNN from appending the "pm25_imp" indicator column,
# so it no longer has to be dropped by hand afterwards.
# k can be tuned later to see which value gives the best model performance.
main_data <- kNN(main_data, variable = "pm25", k = 5, imp_var = FALSE)
## n year avg_chip_seconds high_temp
## 37.00 1996.00 8957.67 44.00
## low_temp avg_temp precipitation dew_point
## 28.00 29.58 0.00 21.13
## wind_speed visibility sea_level_pressure aqi
## 6.00 10.00 29.13 33.00
## co ozone no2 n
## 2.00 21.00 8.00 14462.00
## year avg_chip_seconds high_temp low_temp
## 2024.00 21729.70 88.00 72.00
## avg_temp precipitation dew_point wind_speed
## 79.35 0.61 65.22 39.00
## visibility sea_level_pressure aqi co
## 20.00 30.52 119.00 56.00
## ozone no2
## 100.00 66.00
# Drop kNN's imputation-indicator column (a no-op if it is absent)
main_data <- main_data[, setdiff(names(main_data), "pm25_imp")]
# Confirm the imputation left no NAs in pm25
summary(main_data$pm25)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 26.00 47.00 53.00 56.83 65.00 119.00
We can see that there are no missing values now and we can get a clean summary of the data.
Convert categorical variables to factors: (KB)
# Encode the identifiers (subgroup, gender, marathon) and the categorical
# predictor main_pollutant as factors so models treat them correctly
main_data <- main_data %>%
  mutate(across(c(subgroup, gender, marathon, main_pollutant), factor))
str(main_data)
## 'data.frame': 770 obs. of 20 variables:
## $ n : int 159 1057 3481 2393 1389 767 5095 8091 6697 6677 ...
## $ marathon : Factor w/ 3 levels "Boston","Chicago",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ year : int 1996 1996 1996 1996 1996 1996 1996 1996 1996 1996 ...
## $ gender : Factor w/ 2 levels "female","male": 1 1 1 1 1 2 2 2 2 2 ...
## $ subgroup : Factor w/ 5 levels "elite","competitive",..: 1 2 3 4 5 1 2 3 4 5 ...
## $ avg_chip_seconds : num 10661 12931 14792 17030 20673 ...
## $ high_temp : int 45 45 45 45 45 45 45 45 45 45 ...
## $ low_temp : int 36 36 36 36 36 36 36 36 36 36 ...
## $ avg_temp : num 40.5 40.5 40.5 40.5 40.5 ...
## $ precipitation : num 0 0 0 0 0 0 0 0 0 0 ...
## $ dew_point : num 24.2 24.2 24.2 24.2 24.2 ...
## $ wind_speed : int 20 20 20 20 20 20 20 20 20 20 ...
## $ visibility : num 10 10 10 10 10 10 10 10 10 10 ...
## $ sea_level_pressure: num 30.2 30.2 30.2 30.2 30.2 ...
## $ aqi : int 69 69 69 69 69 69 69 69 69 69 ...
## $ main_pollutant : Factor w/ 4 levels "NO2","Ozone",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ co : num 13 13 13 13 13 13 13 13 13 13 ...
## $ ozone : num 41 41 41 41 41 41 41 41 41 41 ...
## $ pm25 : num 56 55 56 55 56 56 55 56 55 55 ...
## $ no2 : num 34 34 34 34 34 34 34 34 34 34 ...
We can see that the identifiers (subgroup, gender, marathon) and predictor (main_pollutant) were all converted to factors so that our models can properly recognize them.
# Feature engineering: supershoe-era flag plus pairwise interaction terms
main_data_fe <- main_data %>%
  mutate(
    # carbon-plate "supershoes" became widespread from 2018 onward
    supershoe = factor(ifelse(year >= 2018, 1, 0), levels = c(0, 1)),
    # weather / air-quality interactions
    temp_dew_interaction = avg_temp * dew_point,
    temp_aqi_interaction = avg_temp * aqi,
    temp_precip_interaction = avg_temp * precipitation,
    temp_wind_interaction = avg_temp * wind_speed,
    pm25_temp_interaction = pm25 * avg_temp,
    dew_wind_interaction = dew_point * wind_speed,
    pressure_temp_interaction = sea_level_pressure * avg_temp,
    # lets the temperature effect differ by gender (male = 1)
    avg_temp_gender_interaction = avg_temp * as.numeric(gender == "male")
  )
str(main_data_fe)
## 'data.frame': 770 obs. of 29 variables:
## $ n : int 159 1057 3481 2393 1389 767 5095 8091 6697 6677 ...
## $ marathon : Factor w/ 3 levels "Boston","Chicago",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ year : int 1996 1996 1996 1996 1996 1996 1996 1996 1996 1996 ...
## $ gender : Factor w/ 2 levels "female","male": 1 1 1 1 1 2 2 2 2 2 ...
## $ subgroup : Factor w/ 5 levels "elite","competitive",..: 1 2 3 4 5 1 2 3 4 5 ...
## $ avg_chip_seconds : num 10661 12931 14792 17030 20673 ...
## $ high_temp : int 45 45 45 45 45 45 45 45 45 45 ...
## $ low_temp : int 36 36 36 36 36 36 36 36 36 36 ...
## $ avg_temp : num 40.5 40.5 40.5 40.5 40.5 ...
## $ precipitation : num 0 0 0 0 0 0 0 0 0 0 ...
## $ dew_point : num 24.2 24.2 24.2 24.2 24.2 ...
## $ wind_speed : int 20 20 20 20 20 20 20 20 20 20 ...
## $ visibility : num 10 10 10 10 10 10 10 10 10 10 ...
## $ sea_level_pressure : num 30.2 30.2 30.2 30.2 30.2 ...
## $ aqi : int 69 69 69 69 69 69 69 69 69 69 ...
## $ main_pollutant : Factor w/ 4 levels "NO2","Ozone",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ co : num 13 13 13 13 13 13 13 13 13 13 ...
## $ ozone : num 41 41 41 41 41 41 41 41 41 41 ...
## $ pm25 : num 56 55 56 55 56 56 55 56 55 55 ...
## $ no2 : num 34 34 34 34 34 34 34 34 34 34 ...
## $ supershoe : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ temp_dew_interaction : num 978 978 978 978 978 ...
## $ temp_aqi_interaction : num 2792 2792 2792 2792 2792 ...
## $ temp_precip_interaction : num 0 0 0 0 0 0 0 0 0 0 ...
## $ temp_wind_interaction : num 809 809 809 809 809 ...
## $ pm25_temp_interaction : num 2266 2225 2266 2225 2266 ...
## $ dew_wind_interaction : num 483 483 483 483 483 ...
## $ pressure_temp_interaction : num 1222 1222 1222 1222 1222 ...
## $ avg_temp_gender_interaction: num 0 0 0 0 0 ...
Need to do a quick correlation check for numeric variables and see which features introduce multicollinearity.
# Numeric columns to correlate after feature engineering;
# 'year' and 'n' are identifiers, not predictors, so exclude them
numeric_vars <- setdiff(
  names(select(main_data_fe, where(is.numeric))),
  c("year", "n")
)
numeric_vars
## [1] "avg_chip_seconds" "high_temp"
## [3] "low_temp" "avg_temp"
## [5] "precipitation" "dew_point"
## [7] "wind_speed" "visibility"
## [9] "sea_level_pressure" "aqi"
## [11] "co" "ozone"
## [13] "pm25" "no2"
## [15] "temp_dew_interaction" "temp_aqi_interaction"
## [17] "temp_precip_interaction" "temp_wind_interaction"
## [19] "pm25_temp_interaction" "dew_wind_interaction"
## [21] "pressure_temp_interaction" "avg_temp_gender_interaction"
We can see that there are a total of 22 continuous variables,
excluding the identifiers year and n.
Create the correlation matrix and make a visual: (KB)
# Pairwise correlations among the numeric features (pairwise-complete so
# rows with scattered NAs still contribute to each pair)
cor_matrix <- cor(main_data_fe[numeric_vars], use = "pairwise.complete.obs")
# Two decimals for readability
round(cor_matrix, 2)
## avg_chip_seconds high_temp low_temp avg_temp
## avg_chip_seconds 1.00 0.01 0.01 0.01
## high_temp 0.01 1.00 0.78 0.93
## low_temp 0.01 0.78 1.00 0.92
## avg_temp 0.01 0.93 0.92 1.00
## precipitation 0.00 0.12 0.07 0.08
## dew_point 0.01 0.74 0.86 0.83
## wind_speed 0.00 -0.05 -0.17 -0.13
## visibility 0.00 -0.01 -0.11 -0.06
## sea_level_pressure 0.02 -0.35 -0.32 -0.35
## aqi 0.00 0.47 0.32 0.38
## co 0.00 -0.02 -0.06 -0.02
## ozone 0.01 0.62 0.35 0.50
## pm25 0.00 0.43 0.33 0.37
## no2 -0.01 0.27 0.12 0.18
## temp_dew_interaction 0.01 0.85 0.93 0.94
## temp_aqi_interaction 0.01 0.77 0.66 0.74
## temp_precip_interaction 0.00 0.15 0.09 0.11
## temp_wind_interaction 0.01 0.37 0.21 0.30
## pm25_temp_interaction 0.01 0.75 0.68 0.74
## dew_wind_interaction 0.01 0.34 0.28 0.31
## pressure_temp_interaction 0.01 0.93 0.92 1.00
## avg_temp_gender_interaction -0.22 0.16 0.16 0.17
## precipitation dew_point wind_speed visibility
## avg_chip_seconds 0.00 0.01 0.00 0.00
## high_temp 0.12 0.74 -0.05 -0.01
## low_temp 0.07 0.86 -0.17 -0.11
## avg_temp 0.08 0.83 -0.13 -0.06
## precipitation 1.00 0.16 0.25 -0.03
## dew_point 0.16 1.00 0.01 -0.09
## wind_speed 0.25 0.01 1.00 -0.07
## visibility -0.03 -0.09 -0.07 1.00
## sea_level_pressure -0.19 -0.33 -0.08 -0.17
## aqi -0.20 0.36 -0.24 -0.01
## co -0.21 -0.01 -0.28 0.37
## ozone -0.02 0.39 0.25 -0.06
## pm25 -0.29 0.36 -0.33 0.06
## no2 -0.21 0.15 -0.28 0.18
## temp_dew_interaction 0.13 0.96 -0.07 -0.08
## temp_aqi_interaction -0.11 0.64 -0.22 -0.04
## temp_precip_interaction 0.99 0.18 0.24 -0.03
## temp_wind_interaction 0.30 0.33 0.89 -0.08
## pm25_temp_interaction -0.18 0.65 -0.29 0.01
## dew_wind_interaction 0.34 0.52 0.84 -0.09
## pressure_temp_interaction 0.07 0.83 -0.14 -0.07
## avg_temp_gender_interaction 0.01 0.14 -0.02 -0.01
## sea_level_pressure aqi co ozone pm25 no2
## avg_chip_seconds 0.02 0.00 0.00 0.01 0.00 -0.01
## high_temp -0.35 0.47 -0.02 0.62 0.43 0.27
## low_temp -0.32 0.32 -0.06 0.35 0.33 0.12
## avg_temp -0.35 0.38 -0.02 0.50 0.37 0.18
## precipitation -0.19 -0.20 -0.21 -0.02 -0.29 -0.21
## dew_point -0.33 0.36 -0.01 0.39 0.36 0.15
## wind_speed -0.08 -0.24 -0.28 0.25 -0.33 -0.28
## visibility -0.17 -0.01 0.37 -0.06 0.06 0.18
## sea_level_pressure 1.00 -0.10 -0.14 -0.04 -0.18 -0.14
## aqi -0.10 1.00 0.28 0.57 0.94 0.62
## co -0.14 0.28 1.00 -0.10 0.37 0.59
## ozone -0.04 0.57 -0.10 1.00 0.40 0.38
## pm25 -0.18 0.94 0.37 0.40 1.00 0.63
## no2 -0.14 0.62 0.59 0.38 0.63 1.00
## temp_dew_interaction -0.36 0.40 -0.01 0.46 0.40 0.18
## temp_aqi_interaction -0.22 0.89 0.19 0.68 0.84 0.53
## temp_precip_interaction -0.21 -0.18 -0.21 -0.02 -0.27 -0.18
## temp_wind_interaction -0.23 -0.07 -0.28 0.46 -0.16 -0.19
## pm25_temp_interaction -0.28 0.86 0.26 0.56 0.89 0.55
## dew_wind_interaction -0.26 -0.02 -0.25 0.42 -0.09 -0.15
## pressure_temp_interaction -0.29 0.38 -0.03 0.51 0.37 0.18
## avg_temp_gender_interaction -0.06 0.06 0.00 0.09 0.06 0.03
## temp_dew_interaction temp_aqi_interaction
## avg_chip_seconds 0.01 0.01
## high_temp 0.85 0.77
## low_temp 0.93 0.66
## avg_temp 0.94 0.74
## precipitation 0.13 -0.11
## dew_point 0.96 0.64
## wind_speed -0.07 -0.22
## visibility -0.08 -0.04
## sea_level_pressure -0.36 -0.22
## aqi 0.40 0.89
## co -0.01 0.19
## ozone 0.46 0.68
## pm25 0.40 0.84
## no2 0.18 0.53
## temp_dew_interaction 1.00 0.72
## temp_aqi_interaction 0.72 1.00
## temp_precip_interaction 0.16 -0.09
## temp_wind_interaction 0.31 0.10
## pm25_temp_interaction 0.73 0.97
## dew_wind_interaction 0.43 0.14
## pressure_temp_interaction 0.93 0.74
## avg_temp_gender_interaction 0.16 0.13
## temp_precip_interaction temp_wind_interaction
## avg_chip_seconds 0.00 0.01
## high_temp 0.15 0.37
## low_temp 0.09 0.21
## avg_temp 0.11 0.30
## precipitation 0.99 0.30
## dew_point 0.18 0.33
## wind_speed 0.24 0.89
## visibility -0.03 -0.08
## sea_level_pressure -0.21 -0.23
## aqi -0.18 -0.07
## co -0.21 -0.28
## ozone -0.02 0.46
## pm25 -0.27 -0.16
## no2 -0.18 -0.19
## temp_dew_interaction 0.16 0.31
## temp_aqi_interaction -0.09 0.10
## temp_precip_interaction 1.00 0.31
## temp_wind_interaction 0.31 1.00
## pm25_temp_interaction -0.16 0.03
## dew_wind_interaction 0.34 0.93
## pressure_temp_interaction 0.10 0.29
## avg_temp_gender_interaction 0.02 0.05
## pm25_temp_interaction dew_wind_interaction
## avg_chip_seconds 0.01 0.01
## high_temp 0.75 0.34
## low_temp 0.68 0.28
## avg_temp 0.74 0.31
## precipitation -0.18 0.34
## dew_point 0.65 0.52
## wind_speed -0.29 0.84
## visibility 0.01 -0.09
## sea_level_pressure -0.28 -0.26
## aqi 0.86 -0.02
## co 0.26 -0.25
## ozone 0.56 0.42
## pm25 0.89 -0.09
## no2 0.55 -0.15
## temp_dew_interaction 0.73 0.43
## temp_aqi_interaction 0.97 0.14
## temp_precip_interaction -0.16 0.34
## temp_wind_interaction 0.03 0.93
## pm25_temp_interaction 1.00 0.08
## dew_wind_interaction 0.08 1.00
## pressure_temp_interaction 0.74 0.30
## avg_temp_gender_interaction 0.13 0.05
## pressure_temp_interaction
## avg_chip_seconds 0.01
## high_temp 0.93
## low_temp 0.92
## avg_temp 1.00
## precipitation 0.07
## dew_point 0.83
## wind_speed -0.14
## visibility -0.07
## sea_level_pressure -0.29
## aqi 0.38
## co -0.03
## ozone 0.51
## pm25 0.37
## no2 0.18
## temp_dew_interaction 0.93
## temp_aqi_interaction 0.74
## temp_precip_interaction 0.10
## temp_wind_interaction 0.29
## pm25_temp_interaction 0.74
## dew_wind_interaction 0.30
## pressure_temp_interaction 1.00
## avg_temp_gender_interaction 0.17
## avg_temp_gender_interaction
## avg_chip_seconds -0.22
## high_temp 0.16
## low_temp 0.16
## avg_temp 0.17
## precipitation 0.01
## dew_point 0.14
## wind_speed -0.02
## visibility -0.01
## sea_level_pressure -0.06
## aqi 0.06
## co 0.00
## ozone 0.09
## pm25 0.06
## no2 0.03
## temp_dew_interaction 0.16
## temp_aqi_interaction 0.13
## temp_precip_interaction 0.02
## temp_wind_interaction 0.05
## pm25_temp_interaction 0.13
## dew_wind_interaction 0.05
## pressure_temp_interaction 0.17
## avg_temp_gender_interaction 1.00
# Plot the correlation matrix: blue = negative, red = positive
corrplot(
  cor_matrix,
  method = "color",
  col = colorRampPalette(c("blue", "white", "red"))(200),
  tl.cex = 0.6  # shrink the variable labels so they all fit
)
Looking at the visual, we can see that a lot of these features were highly correlated and will have to be removed. For example, high_temp and low_temp are strongly correlated with avg_temp (0.93 and 0.92), and aqi overlaps with pm25 (0.94). Interaction terms like temp_dew_interaction, temp_precip_interaction, and pressure_temp_interaction show very high correlations with their constituent variables (up to 1.00), making them redundant. Similarly, main_pollutant is just a categorical version of aqi. Overall, these features provide little additional information and can be removed to reduce multicollinearity.
Looking at their distributions to see if we should keep or remove them.
# Distribution of the dominant pollutant across race-days
table(main_data_fe$main_pollutant)
##
## NO2 Ozone PM10 PM2.5
## 80 110 10 570
# Supershoe era counts; imbalance is expected since shoes arrived in 2018
table(main_data_fe$supershoe)
##
## 0 1
## 640 130
We can see that main_pollutant still contains data from pm10, which was removed. Also, we know that main_pollutant is the categorical variable for aqi, and since aqi is highly correlated with pm2.5 (often the main pollutant), we will drop aqi and main_pollutant altogether.
Looking at the supershoes variable, there is an expected imbalance, with more observations of years without supershoes (0 = 640) and fewer with supershoes (1 = 130) since they were introduced more recently in 2018. We will keep this as a control variable, and if needed, techniques like weighting can be applied to help the imbalance.
Removing correlated variables as seen above: (KB)
# Drop the features flagged as collinear above, plus main_pollutant
# (which duplicates aqi categorically)
cols_to_remove <- c(
  "high_temp", "low_temp", "aqi",
  "temp_dew_interaction", "temp_precip_interaction",
  "temp_wind_interaction", "pm25_temp_interaction",
  "dew_wind_interaction", "pressure_temp_interaction",
  "main_pollutant"
)
main_data_fe <- main_data_fe[, setdiff(names(main_data_fe), cols_to_remove)]
str(main_data_fe)
## 'data.frame': 770 obs. of 19 variables:
## $ n : int 159 1057 3481 2393 1389 767 5095 8091 6697 6677 ...
## $ marathon : Factor w/ 3 levels "Boston","Chicago",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ year : int 1996 1996 1996 1996 1996 1996 1996 1996 1996 1996 ...
## $ gender : Factor w/ 2 levels "female","male": 1 1 1 1 1 2 2 2 2 2 ...
## $ subgroup : Factor w/ 5 levels "elite","competitive",..: 1 2 3 4 5 1 2 3 4 5 ...
## $ avg_chip_seconds : num 10661 12931 14792 17030 20673 ...
## $ avg_temp : num 40.5 40.5 40.5 40.5 40.5 ...
## $ precipitation : num 0 0 0 0 0 0 0 0 0 0 ...
## $ dew_point : num 24.2 24.2 24.2 24.2 24.2 ...
## $ wind_speed : int 20 20 20 20 20 20 20 20 20 20 ...
## $ visibility : num 10 10 10 10 10 10 10 10 10 10 ...
## $ sea_level_pressure : num 30.2 30.2 30.2 30.2 30.2 ...
## $ co : num 13 13 13 13 13 13 13 13 13 13 ...
## $ ozone : num 41 41 41 41 41 41 41 41 41 41 ...
## $ pm25 : num 56 55 56 55 56 56 55 56 55 55 ...
## $ no2 : num 34 34 34 34 34 34 34 34 34 34 ...
## $ supershoe : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ temp_aqi_interaction : num 2792 2792 2792 2792 2792 ...
## $ avg_temp_gender_interaction: num 0 0 0 0 0 ...
Now we have a dataset that has 770 obs. and only 19 variables after the addition of newly engineered features and removal of highly correlated features.
Using the final merged dataset before feature engineering. There is no gender interaction for the sake of feature engineering investigation.
# (KB from merging and pre-feat-eng)
# Average finishing time per marathon/year (no gender split) (KB, MH)
avg_times_tree <- marathons_all %>%  # (KB) marathons_all instead of runners dataset
  group_by(marathon, year) %>%
  summarize(
    n = n(),  # runners contributing to each marathon-year average
    avg_chip_seconds = mean(chip_seconds, na.rm = TRUE),
    .groups = "drop"
  )
# Attach cleaned weather and air-quality data by year + marathon (KB) (MH)
final_data <- avg_times_tree %>%
  left_join(weather_clean, by = c("year", "marathon")) %>%
  left_join(airquality_clean, by = c("year", "marathon"))
# Impute missing PM2.5 via kNN (k = 5, tunable later for best performance).
# imp_var = FALSE suppresses the "pm25_imp" indicator column so it does
# not need to be removed afterwards.
final_data_tree <- kNN(final_data, variable = "pm25", k = 5, imp_var = FALSE)
## year n avg_chip_seconds high_temp
## 1996.00 8194.00 13266.03 44.00
## low_temp avg_temp precipitation dew_point
## 28.00 29.58 0.00 21.13
## wind_speed visibility sea_level_pressure aqi
## 3.00 6.06 29.13 11.00
## co ozone pm10 no2
## 2.00 7.00 5.00 7.00
## year n avg_chip_seconds high_temp
## 2024.00 55515.00 17604.39 88.00
## low_temp avg_temp precipitation dew_point
## 72.00 79.35 0.85 65.22
## wind_speed visibility sea_level_pressure aqi
## 39.00 20.00 30.54 119.00
## co ozone pm10 no2
## 56.00 100.00 69.00 66.00
# Drop the kNN imputation flag if present (harmless no-op otherwise)
final_data_tree <- final_data_tree[, setdiff(names(final_data_tree), "pm25_imp")]
# Interaction terms + supershoe flag, mirroring main_data_fe (KB, MH)
final_data_tree <- final_data_tree %>%
  mutate(
    supershoe = factor(ifelse(year >= 2018, 1, 0), levels = c(0, 1)),
    temp_dew_interaction = avg_temp * dew_point,
    temp_aqi_interaction = avg_temp * aqi,
    temp_precip_interaction = avg_temp * precipitation,
    temp_wind_interaction = avg_temp * wind_speed,
    pm25_temp_interaction = pm25 * avg_temp,
    dew_wind_interaction = dew_point * wind_speed,
    pressure_temp_interaction = sea_level_pressure * avg_temp
  )
# Same collinearity-driven removals as for main_data_fe, plus pm10
cols_to_remove <- c(
  "high_temp", "low_temp", "aqi",
  "temp_dew_interaction", "temp_precip_interaction",
  "temp_wind_interaction", "pm25_temp_interaction",
  "dew_wind_interaction", "pressure_temp_interaction",
  "main_pollutant", "pm10"
)
final_data_tree <- final_data_tree[, setdiff(names(final_data_tree), cols_to_remove)]
# Flag the COVID-affected year (ZD)
final_data_tree <- final_data_tree %>%
  mutate(covid_era = ifelse(year == 2020, 1, 0))
# Berlin is held out as the missing-data case study
final_data_tree <- final_data_tree %>% filter(marathon != "Berlin")
Add our 90/10 training/test split for the decision tree (ZD) (KB)
# 90/10 train/test split for the decision tree (seed fixed for reproducibility)
set.seed(123)
train_index_tree <- sample(seq_len(nrow(final_data_tree)),
                           size = 0.9 * nrow(final_data_tree))
train_data_tree <- final_data_tree[train_index_tree, ]
test_data_tree <- final_data_tree[-train_index_tree, ]
Decision tree looking at weather and airquality features to consider breaking down into bins. reference: https://bradleyboehmke.github.io/HOML/DT.html (MH) (KB)
# Regression tree over the weather / air-quality features; its split points
# suggest candidate cutoffs for binning.
# Reference: https://bradleyboehmke.github.io/HOML/DT.html (MH) (KB)
feature_check <- rpart(
  avg_chip_seconds ~ avg_temp + precipitation + dew_point + wind_speed + visibility +
    sea_level_pressure + co + ozone + pm25 + no2 + supershoe + temp_aqi_interaction + covid_era,
  data = train_data_tree,
  method = "anova"
)
# Visualise the fitted tree: splits show influential features and thresholds
rpart.plot(feature_check)
We can see that the decision tree shows that wind speed is the most important factor, splitting at 13 units. For higher wind speeds, ozone and temperature further influence the outcome. For lower wind speeds, temperature is the main driver.
Categorical bins from tree (TRAIN ONLY) (MH) (KB)
# Tree-derived categorical bins, built on TRAINING data only (MH) (KB).
# Helper: two-level bin split at the tree's cutoff (>= cutoff -> hi label)
bin_at <- function(x, cutoff, hi, lo) ifelse(x >= cutoff, hi, lo)
# wind speed bin (tree split at 13)
train_data_tree$wind_bin <- bin_at(train_data_tree$wind_speed, 13,
                                   "high_wind", "low_wind")
# ozone bin (tree split at 38)
train_data_tree$ozone_bin <- bin_at(train_data_tree$ozone, 38,
                                    "high_ozone", "low_ozone")
# temp bins: 'cool' and 'moderate' merged, since moderate was a tiny bin
train_data_tree$temp_bin <- cut(train_data_tree$avg_temp,
                                breaks = c(-Inf, 58, Inf),
                                labels = c("cool_moderate", "warm"),
                                right = FALSE)
# pm2.5 bin (tree split at 54)
train_data_tree$pm25_bin <- bin_at(train_data_tree$pm25, 54,
                                   "high_pm25", "low_pm25")
# temp x aqi interaction bin (tree split at 2789)
train_data_tree$temp_aqi_bin <- bin_at(train_data_tree$temp_aqi_interaction, 2789,
                                       "high_interaction", "low_interaction")
Comparing models with continuous and binned data (TRAIN ONLY) (KB, MH)
# Candidate linear models on TRAINING data: every useful combination of
# continuous vs. tree-binned versions of the key features (KB, MH)
# 1. all continuous
model_cont <- lm(
  avg_chip_seconds ~ avg_temp + ozone + pm25 + wind_speed + temp_aqi_interaction,
  data = train_data_tree
)
# 2. everything binned
model_bin <- lm(
  avg_chip_seconds ~ temp_bin + ozone_bin + pm25_bin + wind_bin + temp_aqi_bin,
  data = train_data_tree
)
# 3. only temperature binned
model_temp <- lm(
  avg_chip_seconds ~ temp_bin + ozone + pm25 + wind_speed + temp_aqi_interaction,
  data = train_data_tree
)
# 4. temperature + ozone binned
model_temp_ozone <- lm(
  avg_chip_seconds ~ temp_bin + ozone_bin + pm25 + wind_speed + temp_aqi_interaction,
  data = train_data_tree
)
# 5. air quality binned (ozone + pm25), temperature continuous
model_airbin <- lm(
  avg_chip_seconds ~ avg_temp + ozone_bin + pm25_bin + wind_speed + temp_aqi_interaction,
  data = train_data_tree
)
# 6. only ozone binned
model_ozone <- lm(
  avg_chip_seconds ~ avg_temp + ozone_bin + pm25 + wind_speed + temp_aqi_interaction,
  data = train_data_tree
)
# 7. only wind binned
model_wind <- lm(
  avg_chip_seconds ~ avg_temp + ozone + pm25 + wind_bin + temp_aqi_interaction,
  data = train_data_tree
)
# 8. wind + ozone + pm25 binned
model_wind_air <- lm(
  avg_chip_seconds ~ avg_temp + ozone_bin + pm25_bin + wind_bin + temp_aqi_interaction,
  data = train_data_tree
)
Comparing R2 and comparing AIC (KB, MH)
# Compare in-sample R^2 across the candidate models (higher = more variance explained)
summary(model_cont)$r.squared
## [1] 0.4770668
summary(model_bin)$r.squared
## [1] 0.5411029
summary(model_temp)$r.squared
## [1] 0.4690846
summary(model_temp_ozone)$r.squared
## [1] 0.5248978
summary(model_airbin)$r.squared
## [1] 0.6100282
summary(model_ozone)$r.squared
## [1] 0.5774904
summary(model_wind)$r.squared
## [1] 0.4676609
summary(model_wind_air)$r.squared
## [1] 0.5919024
# Compare AIC (lower = better fit after penalising model complexity)
AIC(model_cont)
## [1] 1140.996
AIC(model_bin)
## [1] 1131.983
AIC(model_temp)
## [1] 1142.041
AIC(model_temp_ozone)
## [1] 1134.377
AIC(model_airbin)
## [1] 1120.753
AIC(model_ozone)
## [1] 1126.282
AIC(model_wind)
## [1] 1142.226
AIC(model_wind_air)
## [1] 1123.888
Caret cross-validation results (TRAINING DATA) (KB, MH)
# 10-fold cross-validation of the same eight specifications (KB, MH).
# The train() calls stay in this exact order after set.seed so the CV
# folds are reproducible.
set.seed(123)
train_control <- trainControl(method = "cv", number = 10)
cv_cont <- train(
  avg_chip_seconds ~ avg_temp + ozone + pm25 + wind_speed + temp_aqi_interaction,
  data = train_data_tree, method = "lm", trControl = train_control
)
cv_bin <- train(
  avg_chip_seconds ~ temp_bin + ozone_bin + pm25_bin + wind_bin + temp_aqi_bin,
  data = train_data_tree, method = "lm", trControl = train_control
)
cv_temp <- train(
  avg_chip_seconds ~ temp_bin + ozone + pm25 + wind_speed + temp_aqi_interaction,
  data = train_data_tree, method = "lm", trControl = train_control
)
cv_temp_ozone <- train(
  avg_chip_seconds ~ temp_bin + ozone_bin + pm25 + wind_speed + temp_aqi_interaction,
  data = train_data_tree, method = "lm", trControl = train_control
)
cv_airbin <- train(
  avg_chip_seconds ~ avg_temp + ozone_bin + pm25_bin + wind_speed + temp_aqi_interaction,
  data = train_data_tree, method = "lm", trControl = train_control
)
cv_ozone <- train(
  avg_chip_seconds ~ avg_temp + ozone_bin + pm25 + wind_speed + temp_aqi_interaction,
  data = train_data_tree, method = "lm", trControl = train_control
)
cv_wind <- train(
  avg_chip_seconds ~ avg_temp + ozone + pm25 + wind_bin + temp_aqi_interaction,
  data = train_data_tree, method = "lm", trControl = train_control
)
cv_wind_air <- train(
  avg_chip_seconds ~ avg_temp + ozone_bin + pm25_bin + wind_bin + temp_aqi_interaction,
  data = train_data_tree, method = "lm", trControl = train_control
)
Getting the results (KB)
# Collect every model's CV metrics into one table, tagged by model name (KB)
cv_fits <- list(
  cv_cont = cv_cont, cv_bin = cv_bin,
  cv_temp = cv_temp, cv_temp_ozone = cv_temp_ozone,
  cv_airbin = cv_airbin, cv_ozone = cv_ozone,
  cv_wind = cv_wind, cv_wind_air = cv_wind_air
)
combined_results <- bind_rows(
  lapply(names(cv_fits), function(nm) mutate(cv_fits[[nm]]$results, Model = nm))
) %>%
  select(Model, everything())
print(combined_results)
## Model intercept RMSE Rsquared MAE RMSESD RsquaredSD
## 1 cv_cont TRUE 895.8262 0.4873768 751.7258 237.5673 0.2541559
## 2 cv_bin TRUE 868.4068 0.5071339 709.1803 186.7728 0.1941312
## 3 cv_temp TRUE 896.0868 0.4642291 751.9601 182.3937 0.2025344
## 4 cv_temp_ozone TRUE 857.2706 0.5004747 706.8899 176.2001 0.2333458
## 5 cv_airbin TRUE 763.2415 0.5920959 635.5167 229.9983 0.2195542
## 6 cv_ozone TRUE 802.1434 0.6226780 672.7320 184.3297 0.2071750
## 7 cv_wind TRUE 930.0377 0.4669259 777.1032 222.1436 0.3023181
## 8 cv_wind_air TRUE 805.9406 0.6037733 671.5632 243.3696 0.1899028
## MAESD
## 1 214.7159
## 2 106.4979
## 3 180.0086
## 4 127.0137
## 5 171.8409
## 6 136.5862
## 7 209.5921
## 8 185.8482
We can see that the model with ozone and PM2.5 binned has the lowest RMSE and MAE, suggesting higher accuracy, and one of the highest R2 to explain variability. The model with just ozone binned has the highest R2, but also slightly higher RMSE and MAE.
Based on these results, we will incorporate the binned ozone and PM2.5 features into our final dataset for modeling and test a couple of initial models with the continuous version of these features, as well as the binned version.
LASSO is being used to answer the following question: Which version of the air-quality and temperature features should we keep for the final model?
Add our 90/10 training/test split for LASSO (KB, ZD)
# 90/10 train/test split of the engineered dataset for LASSO (KB, ZD)
set.seed(123)
train_index <- sample(seq_len(nrow(main_data_fe)),
                      size = 0.9 * nrow(main_data_fe))
train_data_scaled <- main_data_fe[train_index, ]
test_data_scaled <- main_data_fe[-train_index, ]
Data Scaling for LASSO and later linear regression model (ZD, KB)
# Identify numeric predictors to scale (exclude outcome and identifiers)
numeric_vars <- train_data_scaled %>%
select(where(is.numeric)) %>%
select(-avg_chip_seconds, -year, -n) %>%
names()
# scale training data and save scaling parameters
# (the training means/SDs are stored so the SAME transform can be applied
# to the test set, avoiding train/test leakage)
train_scaled <- scale(train_data_scaled[numeric_vars])
train_data_scaled[paste0("scaled_", numeric_vars)] <- train_scaled
train_center <- attr(train_scaled, "scaled:center")
train_scale <- attr(train_scaled, "scaled:scale")
# Scale test data (only numeric vars that exist in test) using the TRAINING
# centers and scales, not the test set's own statistics
numeric_vars_test <- intersect(numeric_vars, names(test_data_scaled))
test_scaled <- scale(test_data_scaled[numeric_vars_test],
center = train_center[numeric_vars_test],
scale = train_scale[numeric_vars_test])
test_data_scaled[paste0("scaled_", numeric_vars_test)] <- test_scaled
# Sanity check: scaled training columns should show mean 0
summary(train_data_scaled[paste0("scaled_", numeric_vars)])
## scaled_avg_temp scaled_precipitation scaled_dew_point scaled_wind_speed
## Min. :-2.4665 Min. :-0.2376 Min. :-1.6105 Min. :-1.57367
## 1st Qu.:-0.6886 1st Qu.:-0.2376 1st Qu.:-0.7537 1st Qu.:-0.57955
## Median :-0.1931 Median :-0.2376 Median :-0.0240 Median :-0.08249
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 0.5735 3rd Qu.:-0.2376 3rd Qu.: 0.6771 3rd Qu.: 0.41458
## Max. : 2.8256 Max. : 6.1116 Max. : 2.4515 Max. : 3.89402
## scaled_visibility scaled_sea_level_pressure scaled_co
## Min. :-0.1146 Min. :-2.06242 Min. :-1.0106
## 1st Qu.:-0.1146 1st Qu.:-0.80425 1st Qu.:-0.6693
## Median :-0.1146 Median : 0.09045 Median :-0.4134
## Mean : 0.0000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.:-0.1146 3rd Qu.: 0.92924 3rd Qu.: 0.2691
## Max. : 8.7115 Max. : 1.82394 Max. : 3.5964
## scaled_ozone scaled_pm25 scaled_no2
## Min. :-1.3319 Min. :-1.8827 Min. :-2.41721
## 1st Qu.:-0.7161 1st Qu.:-0.6027 1st Qu.:-0.78842
## Median :-0.2849 Median :-0.2370 Median : 0.02597
## Mean : 0.0000 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 0.2694 3rd Qu.: 0.4944 3rd Qu.: 0.59605
## Max. : 3.5336 Max. : 3.7858 Max. : 2.30628
## scaled_temp_aqi_interaction scaled_avg_temp_gender_interaction
## Min. :-1.2552 Min. :-0.9842
## 1st Qu.:-0.6619 1st Qu.:-0.9842
## Median :-0.2796 Median : 0.4232
## Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.1835 3rd Qu.: 0.8932
## Max. : 3.0146 Max. : 1.9318
We can see that all the continuous variables for the train and test
data, not including the identifying variables year and
n, were successfully scaled with means at 0 and standard
deviations of 1, showing that the standardization was correctly
applied.
Add Binned features based off Decision Tree (ozone_bin and pm25_bin) (KB)
# Tree-derived bins built on the ORIGINAL (unscaled) ozone / pm25 columns.
# Levels: 0 = below cutoff (low), 1 = at/above cutoff (high)
train_data_scaled$ozone_bin <- factor(ifelse(train_data_scaled$ozone >= 38, 1, 0), levels = c(0, 1))
train_data_scaled$pm25_bin <- factor(ifelse(train_data_scaled$pm25 >= 54, 1, 0), levels = c(0, 1))
test_data_scaled$ozone_bin <- factor(ifelse(test_data_scaled$ozone >= 38, 1, 0), levels = c(0, 1))
test_data_scaled$pm25_bin <- factor(ifelse(test_data_scaled$pm25 >= 54, 1, 0), levels = c(0, 1))
# The raw numeric columns are superseded by their scaled_/binned versions
train_data_scaled <- train_data_scaled %>% select(-all_of(numeric_vars))
test_data_scaled <- test_data_scaled %>% select(-all_of(numeric_vars_test))
We made the factor levels:
- ozone bin: 0 = low, 1 = high - pm25 bin: 0 = low, 1 = high
Prepare the predictor matrix (x) and outcome vector (y) (KB)
# Only using ozone and PM2.5 (continuous and binned).
# model.matrix() one-hot encodes the factor bins; [,-1] drops the intercept
# column because glmnet fits its own intercept.
x_train <- model.matrix(avg_chip_seconds ~ scaled_ozone + scaled_pm25 +
ozone_bin + pm25_bin, data = train_data_scaled)[,-1] # remove intercept
y_train <- train_data_scaled$avg_chip_seconds
x_test <- model.matrix(avg_chip_seconds ~ scaled_ozone + scaled_pm25 +
ozone_bin + pm25_bin, data = test_data_scaled)[,-1]
y_test <- test_data_scaled$avg_chip_seconds
Run LASSO with cross-validation to select lambda (KB)
# Seed makes the CV fold assignment reproducible
set.seed(123)
# alpha = 1 -> pure LASSO penalty; standardize = FALSE because the
# predictors were standardized in the earlier scaling step
lasso_cv <- cv.glmnet(x_train, y_train, alpha = 1, standardize = FALSE) # already scaled
# Find the best lambda (minimizer of mean cross-validated error)
best_lambda <- lasso_cv$lambda.min
best_lambda
## [1] 24.4284
# Fit final LASSO model at best lambda
lasso_model <- glmnet(x_train, y_train, alpha = 1, lambda = best_lambda, standardize = FALSE)
# Check coefficients ("." in the printout means shrunk exactly to zero)
coef(lasso_model)
## 5 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 1.422713e+04
## scaled_ozone 8.896372e-15
## scaled_pm25 .
## ozone_bin1 .
## pm25_bin1 .
# Predict on test set (predict() on a glmnet fit returns a one-column matrix)
preds <- predict(lasso_model, newx = x_test)
# Evaluate performance (RMSE/MAE in seconds of chip time).
# NOTE(review): these assignments shadow Metrics::rmse()/mae() with plain
# numbers in the global environment; later calls like rmse(x, y) still
# resolve to the functions, but renaming these variables would be safer.
rmse <- sqrt(mean((y_test - preds)^2))
mae <- mean(abs(y_test - preds))
rmse
## [1] 3383.16
mae
## [1] 2861.152
At the selected lambda, LASSO shrank scaled_pm25, ozone_bin1, and pm25_bin1 exactly to zero (shown as "." in the coefficient table), and the scaled_ozone coefficient (8.9e-15) is numerically zero as well. In other words, none of the ozone/PM2.5 predictors contributed to predicting average chip time on their own, so this model effectively predicts the training mean. The resulting test RMSE of 3383 seconds and MAE of 2861 seconds should therefore be read as a baseline rather than as good predictive performance.
For a simpler and more interpretable model, we will drop the original numeric features and keep only the binned features based on LASSO results (KB)
# Drop the continuous pollutant features in a single pass per dataset;
# only the binned versions are carried forward from here.
train_data_scaled <- train_data_scaled[, !(names(train_data_scaled) %in% c("scaled_ozone", "scaled_pm25"))]
test_data_scaled <- test_data_scaled[, !(names(test_data_scaled) %in% c("scaled_ozone", "scaled_pm25"))]
str(train_data_scaled)
## 'data.frame': 693 obs. of 19 variables:
## $ n : int 6244 3922 3423 290 883 4643 6002 2815 1587 692 ...
## $ marathon : Factor w/ 3 levels "Boston","Chicago",..: 2 2 1 3 1 1 2 1 2 1 ...
## $ year : int 2014 2019 2014 1998 2016 2007 2002 2019 1997 1997 ...
## $ gender : Factor w/ 2 levels "female","male": 1 1 2 2 1 2 2 2 1 1 ...
## $ subgroup : Factor w/ 5 levels "elite","competitive",..: 5 3 4 1 5 3 4 4 4 4 ...
## $ avg_chip_seconds : num 20881 13601 14941 9364 20215 ...
## $ supershoe : Factor w/ 2 levels "0","1": 1 2 1 1 1 1 1 2 1 1 ...
## $ scaled_avg_temp : num 0.287 -0.588 -0.148 -0.121 -0.193 ...
## $ scaled_precipitation : num -0.238 -0.238 -0.238 -0.238 -0.238 ...
## $ scaled_dew_point : num 0.474 -0.574 -0.59 -0.127 -0.405 ...
## $ scaled_wind_speed : num 0.4146 1.243 -0.0825 0.746 0.746 ...
## $ scaled_visibility : num -0.115 -0.115 -0.115 -0.115 -0.115 ...
## $ scaled_sea_level_pressure : num -1.084 -1.643 1.237 0.314 0.957 ...
## $ scaled_co : num -0.413 -0.925 -0.584 1.208 -0.755 ...
## $ scaled_no2 : num 0.026 -0.87 -0.3 -0.381 0.189 ...
## $ scaled_temp_aqi_interaction : num -0.174 -0.833 0.485 -1.12 -0.28 ...
## $ scaled_avg_temp_gender_interaction: num -0.984 -0.984 0.904 0.913 -0.984 ...
## $ ozone_bin : Factor w/ 2 levels "0","1": 2 1 2 1 2 2 1 2 2 2 ...
## $ pm25_bin : Factor w/ 2 levels "0","1": 1 1 2 1 2 1 2 1 2 1 ...
str(test_data_scaled)
## 'data.frame': 77 obs. of 19 variables:
## $ n : int 159 3481 720 2236 2550 712 1386 2566 645 2817 ...
## $ marathon : Factor w/ 3 levels "Boston","Chicago",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ year : int 1996 1996 1998 1998 1998 2000 2002 2002 2004 2006 ...
## $ gender : Factor w/ 2 levels "female","male": 1 1 1 2 2 1 1 1 1 2 ...
## $ subgroup : Factor w/ 5 levels "elite","competitive",..: 1 3 2 2 3 4 2 3 2 2 ...
## $ avg_chip_seconds : num 10661 14792 12376 11093 12697 ...
## $ supershoe : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ scaled_avg_temp : num -1.31 -1.31 -0.442 -0.442 -0.442 ...
## $ scaled_precipitation : num -0.238 -0.238 -0.238 -0.238 -0.238 ...
## $ scaled_dew_point : num -1.33 -1.33 0.439 0.439 0.439 ...
## $ scaled_wind_speed : num 0.746 0.746 0.746 0.746 0.746 ...
## $ scaled_visibility : num -0.115 -0.115 -0.115 -0.115 -0.115 ...
## $ scaled_sea_level_pressure : num 0.957 0.957 0.342 0.342 0.342 ...
## $ scaled_co : num -0.0721 -0.0721 0.2691 0.2691 0.2691 ...
## $ scaled_no2 : num -0.3 -0.3 0.27 0.27 0.27 ...
## $ scaled_temp_aqi_interaction : num -0.288 -0.288 -0.601 -0.601 -0.601 ...
## $ scaled_avg_temp_gender_interaction: num -0.984 -0.984 -0.984 0.802 0.802 ...
## $ ozone_bin : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 1 1 2 2 ...
## $ pm25_bin : Factor w/ 2 levels "0","1": 2 2 1 1 1 2 2 2 2 1 ...
We can see that both scaled_ozone and scaled_pm25 were successfully removed from the train and test data, leaving only the binned pollutant features. Now this dataset is ready for our initial linear regression model.
Initial Model (MH) (NEED TO CHECK HERE)
# Get rid of this model since we are dropping scaledPM25? Not sure if we want to add other model here? Maybe for Ozone bin? Im not to sure... (KB)
#lm_initial_scaledPM25 <- lm(avg_chip_seconds ~ marathon + gender + subgroup + supershoe + scaled_avg_temp + scaled_precipitation + scaled_dew_point + scaled_wind_speed + scaled_visibility + scaled_sea_level_pressure + scaled_co + scaled_pm25 + ozone_bin + scaled_no2 + scaled_temp_aqi_interaction + scaled_avg_temp_gender_interaction, data = train_data_scaled)
#summary(lm_initial_scaledPM25)
#plot(lm_initial_scaledPM25)
# Baseline OLS: average chip time regressed on race, demographic, shoe,
# weather, air-quality and interaction features, using the binned
# pollutant flags (pm25_bin, ozone_bin) rather than the continuous values
lm_initial_PM25bin <- lm(avg_chip_seconds ~ marathon + gender + subgroup + supershoe + scaled_avg_temp + scaled_precipitation + scaled_dew_point + scaled_wind_speed + scaled_visibility + scaled_sea_level_pressure + scaled_co + pm25_bin + ozone_bin + scaled_no2 + scaled_temp_aqi_interaction + scaled_avg_temp_gender_interaction, data = train_data_scaled)
summary(lm_initial_PM25bin)
##
## Call:
## lm(formula = avg_chip_seconds ~ marathon + gender + subgroup +
## supershoe + scaled_avg_temp + scaled_precipitation + scaled_dew_point +
## scaled_wind_speed + scaled_visibility + scaled_sea_level_pressure +
## scaled_co + pm25_bin + ozone_bin + scaled_no2 + scaled_temp_aqi_interaction +
## scaled_avg_temp_gender_interaction, data = train_data_scaled)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2021.98 -222.10 -9.13 227.41 1685.71
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10348.22 116.65 88.711 < 2e-16 ***
## marathonChicago 374.67 79.99 4.684 3.41e-06 ***
## marathonNYC 609.27 66.09 9.219 < 2e-16 ***
## gendermale -1690.84 174.11 -9.711 < 2e-16 ***
## subgroupcompetitive 1951.30 48.80 39.989 < 2e-16 ***
## subgroupaverage 3763.43 48.28 77.957 < 2e-16 ***
## subgrouprecreational 5901.72 48.43 121.872 < 2e-16 ***
## subgroupslow 9492.33 47.89 198.218 < 2e-16 ***
## supershoe1 -155.72 45.82 -3.399 0.000716 ***
## scaled_avg_temp -82.71 40.71 -2.032 0.042586 *
## scaled_precipitation 22.97 17.25 1.332 0.183383
## scaled_dew_point 107.82 30.38 3.550 0.000413 ***
## scaled_wind_speed 112.33 21.86 5.138 3.65e-07 ***
## scaled_visibility 79.96 17.39 4.599 5.08e-06 ***
## scaled_sea_level_pressure 119.25 28.93 4.122 4.22e-05 ***
## scaled_co -45.37 23.65 -1.918 0.055521 .
## pm25_bin1 89.66 45.56 1.968 0.049464 *
## ozone_bin1 184.35 50.40 3.657 0.000275 ***
## scaled_no2 -67.35 24.87 -2.708 0.006939 **
## scaled_temp_aqi_interaction 66.48 36.86 1.804 0.071749 .
## scaled_avg_temp_gender_interaction 55.64 88.36 0.630 0.529134
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 401 on 672 degrees of freedom
## Multiple R-squared: 0.9866, Adjusted R-squared: 0.9862
## F-statistic: 2478 on 20 and 672 DF, p-value: < 2.2e-16
plot(lm_initial_PM25bin)
Adding RMSE and MAE for looking at overfitting with test results below
(MH/ZD) (NEED TO CHECK HERE)
# In-sample (training) error of the binned-PM2.5 linear model,
# computed from the lm residuals
residuals_lm_initial_PM25bin <- lm_initial_PM25bin$residuals
residuals_lm_initial_PM25bin_rmse <- sqrt(mean(residuals_lm_initial_PM25bin^2))
residuals_lm_initial_PM25bin_mae <- mean(abs(residuals_lm_initial_PM25bin))
#residuals_lm_initial_scaledPM25 <- lm_initial_scaledPM25$residuals
#residuals_lm_initial_scaledPM25_rmse <- sqrt(mean(residuals_lm_initial_scaledPM25^2))
#residuals_lm_initial_scaledPM25_mae <- mean(abs(residuals_lm_initial_scaledPM25))
#model_compare_train <- data.frame(
# Model = c("Continuous PM2.5", "PM2.5 Bin"),
# RMSE = c(residuals_lm_initial_scaledPM25_rmse,
# residuals_lm_initial_PM25bin_rmse),
# MAE = c(residuals_lm_initial_scaledPM25_mae,
# residuals_lm_initial_PM25bin_mae),
# R2 = c(summary(lm_initial_scaledPM25)$r.squared,
# summary(lm_initial_PM25bin)$r.squared)
#)
# Training-set (in-sample) fit metrics for the binned-PM2.5 model.
# Only one model is reported here: the continuous-PM2.5 model was dropped
# above, so the caption no longer claims a "Continuous vs Binned" comparison.
model_compare_train <- data.frame(
  Model = "PM2.5 Bin",
  RMSE = residuals_lm_initial_PM25bin_rmse,
  MAE = residuals_lm_initial_PM25bin_mae,
  R2 = summary(lm_initial_PM25bin)$r.squared
)
# Display nicely formatted table
knitr::kable(model_compare_train, digits = 4,
             caption = "Training Data Metrics: Binned PM2.5 Model")
| Model | RMSE | MAE | R2 |
|---|---|---|---|
| PM2.5 Bin | 394.8929 | 297.1224 | 0.9866 |
More Modeling (ZD) (NEED TO FIX HERE)
# Helper: score a fitted model on a hold-out set.
# Returns a list with RMSE and MAE (same units as avg_chip_seconds) and
# R2 = squared correlation between predictions and observed values.
evaluate <- function(model, test_data) {
  yhat <- predict(model, newdata = test_data)
  obs <- test_data$avg_chip_seconds
  err <- yhat - obs
  list(
    RMSE = sqrt(mean(err^2)),
    MAE = mean(abs(err)),
    R2 = cor(yhat, obs)^2
  )
}
# Evaluate Model 1 with scaled PM2.5
#results_scaledPM25 <- evaluate(lm_initial_scaledPM25, test_data)
# Evaluate Model 2 with PM2.5 bin
results_PM25bin <- evaluate(lm_initial_PM25bin, test_data_scaled)
Compare results (ZD) (NEED TO FIX HERE)
#model_compare <- data.frame(
# Model = c("Continuous PM2.5", "PM2.5 Bin"),
# RMSE = c(results_scaledPM25$RMSE, results_PM25bin$RMSE),
# MAE = c(results_scaledPM25$MAE, results_PM25bin$MAE),
# R2 = c(results_scaledPM25$R2, results_PM25bin$R2)
#)
#knitr::kable(model_compare, digits = 4,
# caption = "Comparison of Initial Models: Continuous vs Binned PM2.5")
# Test-set performance of the binned-PM2.5 model.
# Caption fixed: only one model is evaluated here (the continuous-PM2.5
# model was dropped), so this is not a model comparison.
model_compare <- data.frame(
  Model = "PM2.5 Bin",
  RMSE = results_PM25bin$RMSE,
  MAE = results_PM25bin$MAE,
  R2 = results_PM25bin$R2
)
knitr::kable(model_compare, digits = 4,
             caption = "Test Data Metrics: Binned PM2.5 Model")
| Model | RMSE | MAE | R2 |
|---|---|---|---|
| PM2.5 Bin | 358.0687 | 274.0892 | 0.9892 |
Build residual plots (ZD)
plot(lm_initial_PM25bin, which = 1) # Residuals vs Fitted
plot(lm_initial_PM25bin, which = 2) # Q–Q plot
Add our 90/10 training/test split with no scaling (ZD)
# Split the engineered-feature data into 90% train / 10% test
set.seed(42) # reproducible split
# seq_len() is safer than 1:nrow() (correct on zero-row input), and
# floor() makes the otherwise-silent truncation of 0.9 * n explicit;
# both produce the identical sample as the original call.
train_index <- sample(seq_len(nrow(main_data_fe)), size = floor(0.9 * nrow(main_data_fe)))
train_data <- main_data_fe[train_index, ]
test_data <- main_data_fe[-train_index, ]
Add Binned features based off Decision Tree (ozone_bin and pm25_bin) (KB)
# Binary pollution indicators from the decision-tree cut points:
# ozone >= 38 -> "1" (high), pm25 >= 54 -> "1" (high), else "0" (low).
train_data$ozone_bin <- factor(as.integer(train_data$ozone >= 38), levels = c(0, 1))
train_data$pm25_bin <- factor(as.integer(train_data$pm25 >= 54), levels = c(0, 1))
# Apply the identical thresholds to the test data (no re-fitting on test)
test_data$ozone_bin <- factor(as.integer(test_data$ozone >= 38), levels = c(0, 1))
test_data$pm25_bin <- factor(as.integer(test_data$pm25 >= 54), levels = c(0, 1))
We made the factor levels:
- ozone bin: 0 = low, 1 = high - pm25 bin: 0 = low, 1 = high
Remove ozone and pm25 based on the LASSO results, keeping the ozone and pm25 bins from the decision tree (KB)
# Remove ozone and pm25 from training and test data
# (the raw continuous pollutant columns; the *_bin versions created
# above are retained)
train_data <- train_data[, !(names(train_data) %in% c("ozone", "pm25"))]
test_data <- test_data[, !(names(test_data) %in% c("ozone", "pm25"))]
str(train_data)
## 'data.frame': 693 obs. of 19 variables:
## $ n : int 77 62 4657 1318 4199 269 5155 1603 4888 3163 ...
## $ marathon : Factor w/ 3 levels "Boston","Chicago",..: 3 2 1 1 1 1 3 1 1 2 ...
## $ year : int 2002 2005 2011 2003 2019 2010 2009 2000 2008 2003 ...
## $ gender : Factor w/ 2 levels "female","male": 1 1 1 1 2 2 1 2 2 1 ...
## $ subgroup : Factor w/ 5 levels "elite","competitive",..: 1 1 3 4 3 1 4 4 3 3 ...
## $ avg_chip_seconds : num 10485 10318 14201 16716 12785 ...
## $ avg_temp : num 41.3 53.5 52.6 47.3 55.8 ...
## $ precipitation : num 0 0 0 0 0.61 0 0 0 0 0 ...
## $ dew_point : num 21.1 42.5 27.7 37.7 49.3 ...
## $ wind_speed : int 13 18 25 25 26 18 9 21 13 16 ...
## $ visibility : num 10 10 10 10 10 10 10 10 10 10 ...
## $ sea_level_pressure : num 30 29.4 30 30.1 29.6 ...
## $ co : num 17 8 5 10 3 3 10 7 7 18 ...
## $ no2 : num 44 38 20 43 35 29 30 30 36 45 ...
## $ supershoe : Factor w/ 2 levels "0","1": 1 1 1 1 2 1 1 1 1 1 ...
## $ temp_aqi_interaction : num 2148 2622 2787 3358 2624 ...
## $ avg_temp_gender_interaction: num 0 0 0 0 55.8 ...
## $ ozone_bin : Factor w/ 2 levels "0","1": 1 1 2 2 2 2 1 2 2 1 ...
## $ pm25_bin : Factor w/ 2 levels "0","1": 1 1 1 2 1 1 1 2 2 2 ...
str(test_data)
## 'data.frame': 77 obs. of 19 variables:
## $ n : int 767 261 2100 712 721 1561 293 798 3096 186 ...
## $ marathon : Factor w/ 3 levels "Boston","Chicago",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ year : int 1996 1999 2000 2000 2000 2001 2001 2002 2005 2007 ...
## $ gender : Factor w/ 2 levels "female","male": 2 2 1 1 2 1 2 2 1 1 ...
## $ subgroup : Factor w/ 5 levels "elite","competitive",..: 1 1 3 4 5 2 1 5 3 1 ...
## $ avg_chip_seconds : num 9553 9510 14452 16852 17823 ...
## $ avg_temp : num 40.5 51.1 43 43 43 ...
## $ precipitation : num 0 0 0 0 0 0 0 0 0 0 ...
## $ dew_point : num 24.2 38.4 30.4 30.4 30.4 ...
## $ wind_speed : int 20 15 21 21 21 15 15 15 13 39 ...
## $ visibility : num 10 10 10 10 10 10 10 10 10 10 ...
## $ sea_level_pressure : num 30.2 29.9 30.3 30.3 30.3 ...
## $ co : num 13 14 7 7 7 9 9 13 9 5 ...
## $ no2 : num 34 53 30 30 30 42 42 38 39 25 ...
## $ supershoe : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ temp_aqi_interaction : num 2792 2708 2365 2365 2365 ...
## $ avg_temp_gender_interaction: num 40.5 51.1 0 0 43 ...
## $ ozone_bin : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 1 2 2 ...
## $ pm25_bin : Factor w/ 2 levels "0","1": 2 1 2 2 2 2 2 2 2 1 ...
We can see that ozone and pm25 were successfully removed from both datasets and we now have the binned features.
Attempt for XGboost model containing the engineered features, binned features, and scaled features: (ZD)
# Remove identifiers before creating model matrix
# Convert dataset to numeric-only matrix; model.matrix() one-hot encodes
# the factor columns (the intercept column it adds is harmless for trees)
train_matrix <- model.matrix(
avg_chip_seconds ~ .,
data = train_data %>% select(-year, -n)
)
test_matrix <- model.matrix(
avg_chip_seconds ~ .,
data = test_data %>% select(-year, -n)
)
# Convert to DMatrix format for tuning, needed for cross validation
dtrain <- xgb.DMatrix(data = train_matrix, label = train_data$avg_chip_seconds)
dtest <- xgb.DMatrix(data = test_matrix, label = test_data$avg_chip_seconds)
Create an untuned XGBoost model (ZD)
# Untuned baseline: xgboost defaults everywhere except the round count
xgb_base <- xgboost(
data = dtrain,
nrounds = 200,
objective = "reg:squarederror",
verbose = FALSE # mute iterations
)
pred_xgb <- predict(xgb_base, dtest)
# Held-out test error in seconds of chip time (Metrics::rmse / Metrics::mae)
rmse_xgb <- rmse(test_data$avg_chip_seconds, pred_xgb)
mae_xgb <- mae(test_data$avg_chip_seconds, pred_xgb)
c(RMSE = rmse_xgb, MAE = mae_xgb)
## RMSE MAE
## 184.0134 120.6324
set.seed(123)
# Zack's original attempt
# 5-fold CV with a slow learning rate; early stopping halts training if
# the CV error fails to improve for 50 consecutive rounds
xgb_cv_of <- xgb.cv(
data = dtrain,
nrounds = 2000,
nfold = 5,
objective = "reg:squarederror",
eta = 0.01,
max_depth = 4,
subsample = 0.8,
colsample_bytree = 0.8,
early_stopping_rounds = 50,
verbose = 0 # mute iterations
)
# best_iteration == nrounds means early stopping never fired, so the
# true optimum may lie beyond 2000 rounds
xgb_cv_of$best_iteration
## [1] 2000
# Train model with the same hyperparameters used in the CV run above
params_of <- list(
objective = "reg:squarederror",
eta = 0.01,
max_depth = 4,
subsample = 0.8,
colsample_bytree = 0.8
)
# Find best # of trees from cv
best_xgb_of <- xgb.train(
params = params_of,
data = dtrain,
nrounds = xgb_cv_of$best_iteration #2000
)
We can see that the XGBoost model performed best at 2000 trees according to cross-validation, which is hitting the maximum nrounds — early stopping never triggered, so the optimum may lie beyond 2000 trees.
Checking to see if the model is overfitting (ZD, KB)
# Overfitting? Compare the in-fold training RMSE against the
# cross-validated RMSE from the same xgb.cv run.
# NOTE(review): Test_RMSE comes from the separate untuned 200-round
# xgb_base model, not from this CV configuration — compare with care.
cv_rmse_of <- min(xgb_cv_of$evaluation_log$test_rmse_mean)
train_rmse_of <- min(xgb_cv_of$evaluation_log$train_rmse_mean)
data.frame(
Train_RMSE = train_rmse_of,
CV_RMSE = cv_rmse_of,
Test_RMSE = rmse_xgb
)
## Train_RMSE CV_RMSE Test_RMSE
## 1 85.79693 233.44 184.0134
We can see that the model is strongly overfitting because the train RMSE (~86) is a lot lower than the cross-validated RMSE (~233).
Do this by running a tuning loop.
To make sure our XGBoost model is as accurate and reliable as possible, we built a hyperparameter-tuning loop. We can use this instead of guessing which settings would give us the best performance. This loop is able to automatically test several combinations using 5-fold cross-validation. It then checks how well the model predicts on unseen data and records the best RMSE and number of boosting rounds. Then it selects the combination with the lowest cross-validated error. The tuning loop is a balanced model that is flexible enough to capture real patterns and its regularized enough to avoid memorizing all the noise.
Need to undo the #’s if want to run the long loop.
#params_grid <- expand.grid(
# max_depth = c(2, 3, 4),
# min_child_weight = c(3, 5, 10),
# subsample = c(0.7, 0.8),
# colsample_bytree = c(0.6, 0.8),
# eta = c(0.05, 0.1),
# gamma = c(0, 0.1),
# lambda = c(0.5, 1),
# alpha = c(0, 1)
#)
#results <- list()
#for (i in 1:nrow(params_grid)) {
# params <- list(
# objective = "reg:squarederror",
# max_depth = params_grid$max_depth[i],
# min_child_weight = params_grid$min_child_weight[i],
# subsample = params_grid$subsample[i],
# colsample_bytree = params_grid$colsample_bytree[i],
# eta = params_grid$eta[i],
# gamma = params_grid$gamma[i],
# lambda = params_grid$lambda[i],
# alpha = params_grid$alpha[i]
# )
# cv <- xgb.cv(
# params = params,
# data = dtrain,
# nrounds = 1500,
# nfold = 5,
# early_stopping_rounds = 30,
# verbose = 0
# )
# results[[i]] <- list(
# params = params,
# best_rmse = min(cv$evaluation_log$test_rmse_mean),
# best_iter = cv$best_iteration
# )
#}
Next extract the best hyperparameters (KB)
Need to undo the #’s if want to run the long loop and see the best parameters and nrounds.
# Find the index of the combination with the lowest CV RMSE
#best_index <- which.min(sapply(results, function(x) x$best_rmse))
# Get the best parameters
#best_params <- results[[best_index]]$params
#best_params
# Get the best number of boosting rounds
#best_nrounds <- results[[best_index]]$best_iter
#best_nrounds
This way you don’t have to run the tuning loop, since it takes up a lot of time (KB)
set.seed(42)
# Set best hyperparameters directly
# (values reproduced from the commented-out grid search above so the
# slow tuning loop does not have to be re-run)
best_params <- list(
objective = "reg:squarederror",
max_depth = 4,
min_child_weight = 10,
subsample = 0.8,
colsample_bytree = 0.8,
eta = 0.1,
gamma = 0,
lambda = 0.5,
alpha = 0
)
best_nrounds <- 1498 # from tuning
# Train final XGBoost model
xgb_model <- xgboost(
data = dtrain,
params = best_params,
nrounds = best_nrounds,
verbose = FALSE
)
# Predictions on train and test sets
pred_train <- predict(xgb_model, dtrain)
pred_test <- predict(xgb_model, dtest)
# Train metrics; R2 computed as 1 - RSS/TSS
rmse_train <- rmse(train_data$avg_chip_seconds, pred_train)
mae_train <- mae(train_data$avg_chip_seconds, pred_train)
rss_train <- sum((train_data$avg_chip_seconds - pred_train)^2)
tss_train <- sum((train_data$avg_chip_seconds - mean(train_data$avg_chip_seconds))^2)
r2_train <- 1 - rss_train / tss_train
# CV RMSE from tuning
# NOTE(review): hard-coded from an earlier tuning run; stale if the
# grid search is re-run — verify it matches the current data split
cv_rmse <- 151.6136 # manually use the CV RMSE from previous results
# Test metrics
rmse_test <- rmse(test_data$avg_chip_seconds, pred_test)
mae_test <- mae(test_data$avg_chip_seconds, pred_test)
rss_test <- sum((test_data$avg_chip_seconds - pred_test)^2)
tss_test <- sum((test_data$avg_chip_seconds - mean(test_data$avg_chip_seconds))^2)
r2_test <- 1 - rss_test / tss_test
# Combine into a single table
model_eval <- data.frame(
Metric = c("Train RMSE", "Train MAE", "Train R2",
"CV RMSE",
"Test RMSE", "Test MAE", "Test R2"),
Value = c(rmse_train, mae_train, r2_train,
cv_rmse,
rmse_test, mae_test, r2_test)
)
# Display table
knitr::kable(model_eval, digits = 4, caption = "XGBoost (Engineered Features) Model Best Hyperparameters - Overfitting Check")
| Metric | Value |
|---|---|
| Train RMSE | 49.2031 |
| Train MAE | 36.1951 |
| Train R2 | 0.9998 |
| CV RMSE | 151.6136 |
| Test RMSE | 149.5008 |
| Test MAE | 100.9366 |
| Test R2 | 0.9982 |
NOTE Models may be slightly different with small fluctuations that are likely due to sampling variability.
After tuning, our XGBoost model with engineered features shows substantial improvement over the baseline, though care should still be taken when interpreting training performance metrics.
We can see that the model is not overfitting severely (Train RMSE = 49.20, Test RMSE = 149.50, per the table above). The gap between train and test performance is smaller than in the original, untuned model (Train RMSE: ~86, Test RMSE: ~184), indicating a meaningful improvement in generalization.
Cross-validation error is reasonably close to test error (CV RMSE = 151.61, Test RMSE = 149.50), which is encouraging. This suggests that our 5-fold CV is reliable, the model generalizes well, and there is no obvious data leakage.
Both R² values indicate excellent fit (Train R² = 0.9998, Test R² = 0.9982). The nearly perfect training R² is likely due to the engineered subgroup features capturing much of the variability, so it does not necessarily indicate overfitting.
Overall, the tuned model clearly outperforms the baseline (Baseline Test RMSE ≈ 184, Tuned Test RMSE ≈ 149.50), representing roughly a 20% reduction in test error. This demonstrates that the hyperparameter tuning and feature engineering were effective in improving predictive accuracy and stability.
# Variable importance amongst final XGBoosted model
# (Gain = share of loss reduction attributed to the feature; Cover and
# Frequency describe how widely/often it is used in splits)
importance <- xgb.importance(model = xgb_model)
print(importance)
## Feature Gain Cover Frequency
## <char> <num> <num> <num>
## 1: subgroupslow 6.348407e-01 0.0500358889 0.0516922639
## 2: subgrouprecreational 1.913778e-01 0.0325519894 0.0339874858
## 3: subgroupaverage 7.775428e-02 0.0237266238 0.0278014790
## 4: subgroupcompetitive 4.602279e-02 0.0244484263 0.0302189989
## 5: gendermale 3.128102e-02 0.0199346849 0.0604379977
## 6: avg_temp_gender_interaction 3.021160e-03 0.0848715226 0.0955631399
## 7: sea_level_pressure 2.253195e-03 0.0965370507 0.0937855518
## 8: avg_temp 2.116754e-03 0.1212104986 0.1125568828
## 9: wind_speed 1.998203e-03 0.0911142469 0.0768629124
## 10: dew_point 1.696448e-03 0.1067639263 0.0974118316
## 11: marathonNYC 1.628296e-03 0.0110359629 0.0157849829
## 12: temp_aqi_interaction 1.562867e-03 0.0948244723 0.0885238908
## 13: no2 1.341968e-03 0.0858332036 0.0780005688
## 14: co 1.252114e-03 0.1069397341 0.0854664391
## 15: supershoe1 6.611291e-04 0.0096907857 0.0089590444
## 16: marathonChicago 6.333144e-04 0.0053051859 0.0104522184
## 17: ozone_bin1 2.142759e-04 0.0093821935 0.0118031854
## 18: pm25_bin1 1.836953e-04 0.0076414487 0.0085324232
## 19: precipitation 1.014640e-04 0.0176442313 0.0118742890
## 20: visibility 5.848327e-05 0.0005079236 0.0002844141
## Feature Gain Cover Frequency
xgb.plot.importance(importance)
We can see that the subgroup indicators dominate the model: the four subgroup dummies together account for roughly 95% of the total gain, with gender next most important, while the weather, air-quality, and binned pollution features each contribute comparatively little.
No feature engineering or binning occurred on the main_data.
str(main_data)
## 'data.frame': 770 obs. of 20 variables:
## $ n : int 159 1057 3481 2393 1389 767 5095 8091 6697 6677 ...
## $ marathon : Factor w/ 3 levels "Boston","Chicago",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ year : int 1996 1996 1996 1996 1996 1996 1996 1996 1996 1996 ...
## $ gender : Factor w/ 2 levels "female","male": 1 1 1 1 1 2 2 2 2 2 ...
## $ subgroup : Factor w/ 5 levels "elite","competitive",..: 1 2 3 4 5 1 2 3 4 5 ...
## $ avg_chip_seconds : num 10661 12931 14792 17030 20673 ...
## $ high_temp : int 45 45 45 45 45 45 45 45 45 45 ...
## $ low_temp : int 36 36 36 36 36 36 36 36 36 36 ...
## $ avg_temp : num 40.5 40.5 40.5 40.5 40.5 ...
## $ precipitation : num 0 0 0 0 0 0 0 0 0 0 ...
## $ dew_point : num 24.2 24.2 24.2 24.2 24.2 ...
## $ wind_speed : int 20 20 20 20 20 20 20 20 20 20 ...
## $ visibility : num 10 10 10 10 10 10 10 10 10 10 ...
## $ sea_level_pressure: num 30.2 30.2 30.2 30.2 30.2 ...
## $ aqi : int 69 69 69 69 69 69 69 69 69 69 ...
## $ main_pollutant : Factor w/ 4 levels "NO2","Ozone",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ co : num 13 13 13 13 13 13 13 13 13 13 ...
## $ ozone : num 41 41 41 41 41 41 41 41 41 41 ...
## $ pm25 : num 56 55 56 55 56 56 55 56 55 55 ...
## $ no2 : num 34 34 34 34 34 34 34 34 34 34 ...
Add our 90/10 training/test split (ZD, KB)
# Split the raw (un-engineered) data into 90% train / 10% test
set.seed(123) # reproducible split
# seq_len() is safer than 1:nrow() (correct on zero-row input) and
# floor() makes the silent truncation of 0.9 * n explicit; both produce
# the identical sample as the original call.
train_index <- sample(seq_len(nrow(main_data)), size = floor(0.9 * nrow(main_data)))
train_data_raw <- as.data.frame(main_data[train_index, ]) # making sure they are data frames
test_data_raw <- as.data.frame(main_data[-train_index, ])
Remove identifiers before creating model matrix (ZD, KB)
# Convert dataset to numeric-only matrix; model.matrix() one-hot encodes
# the factor columns (the intercept column it adds is harmless for trees)
train_matrix_raw <- model.matrix(
avg_chip_seconds ~ .,
data = train_data_raw %>% select(-year, -n)
)
test_matrix_raw <- model.matrix(
avg_chip_seconds ~ .,
data = test_data_raw %>% select(-year, -n)
)
# Convert to DMatrix format for tuning, needed for cross validation
dtrain_raw <- xgb.DMatrix(data = train_matrix_raw, label = train_data_raw$avg_chip_seconds)
dtest_raw <- xgb.DMatrix(data = test_matrix_raw, label = test_data_raw$avg_chip_seconds)
Creating the XGBoost Model(ZD, KB)
# Create an untuned XGBoost model (defaults except the round count)
xgb_base_raw <- xgboost(
data = dtrain_raw,
nrounds = 200,
objective = "reg:squarederror",
verbose = FALSE # mute iterations
)
pred_xgb_raw <- predict(xgb_base_raw, dtest_raw)
# Held-out test error of the untuned raw-feature baseline (seconds)
rmse_xgb_raw <- rmse(test_data_raw$avg_chip_seconds, pred_xgb_raw)
mae_xgb_raw <- mae(test_data_raw$avg_chip_seconds, pred_xgb_raw)
c(RMSE = rmse_xgb_raw, MAE = mae_xgb_raw)
## RMSE MAE
## 163.0404 122.4257
The untuned raw-feature model reaches a test RMSE of about 163 seconds and a MAE of about 122 seconds, slightly better than the untuned engineered-feature baseline (test RMSE ≈ 184). Next we check whether this model is overfitting before tuning it.
This represents the initial overfitting model prior to tuning (ZD, KB)
# Zack's original attempt
# 5-fold CV on the raw features with the same settings as the
# engineered-feature run (eta 0.01, depth 4, 80% row/column subsampling)
xgb_cv_raw_of <- xgb.cv(
data = dtrain_raw,
nrounds = 2000,
nfold = 5,
objective = "reg:squarederror",
eta = 0.01,
max_depth = 4,
subsample = 0.8,
colsample_bytree = 0.8,
early_stopping_rounds = 50,
verbose = 0 # mute iterations
)
# best_iteration == nrounds: early stopping never fired within 2000 rounds
xgb_cv_raw_of$best_iteration
## [1] 2000
# Train model with the same hyperparameters used in the CV run above
params_raw_of <- list(
objective = "reg:squarederror",
eta = 0.01,
max_depth = 4,
subsample = 0.8,
colsample_bytree = 0.8
)
# Find best # of trees from cv
best_xgb_raw_of <- xgb.train(
params = params_raw_of,
data = dtrain_raw,
nrounds = xgb_cv_raw_of$best_iteration
)
We can see that the XGBoost model performed best at 2000 trees according to cross-validation, which is hitting the maximum nrounds — early stopping never triggered, so the optimum may lie beyond 2000 trees.
Checking to see if the model is overfitting (ZD, KB)
# Overfitting? Compare in-fold training RMSE with cross-validated RMSE.
# NOTE(review): Test_RMSE comes from the separate untuned 200-round
# xgb_base_raw model, not from this CV configuration.
cv_rmse_raw_of <- min(xgb_cv_raw_of$evaluation_log$test_rmse_mean)
train_rmse_raw_of <- min(xgb_cv_raw_of$evaluation_log$train_rmse_mean)
data.frame(
Train_RMSE = train_rmse_raw_of,
CV_RMSE = cv_rmse_raw_of,
Test_RMSE = rmse_xgb_raw
)
## Train_RMSE CV_RMSE Test_RMSE
## 1 92.81347 233.9103 163.0404
We can see that the model is strongly overfitting because the train RMSE (~93) is a lot lower than the cross-validated RMSE (~234).
Do this by first running a tuning loop on the raw features as done previously (KB)
To make sure our XGBoost raw features model is as accurate and reliable as possible, we built a hyperparameter-tuning loop. We can use this instead of guessing which settings would give us the best performance. This loop is able to automatically test several combinations using 5-fold cross-validation. It then checks how well the model predicts on unseen data and records the best RMSE and number of boosting rounds. Then it selects the combination with the lowest cross-validated error. The tuning loop is a balanced model that is flexible enough to capture real patterns and its regularized enough to avoid memorizing all the noise.
Need to undo the #’s if want to run the long loop.
#params_grid_raw <- expand.grid(
# max_depth = c(2, 3, 4),
# min_child_weight = c(3, 5, 10),
# subsample = c(0.7, 0.8),
# colsample_bytree = c(0.6, 0.8),
# eta = c(0.05, 0.1),
# gamma = c(0, 0.1),
# lambda = c(0.5, 1),
# alpha = c(0, 1)
#)
#results_raw <- list()
#for (i in 1:nrow(params_grid_raw)) {
# params <- list(
# objective = "reg:squarederror",
# max_depth = params_grid_raw$max_depth[i],
# min_child_weight = params_grid_raw$min_child_weight[i],
# subsample = params_grid_raw$subsample[i],
# colsample_bytree = params_grid_raw$colsample_bytree[i],
# eta = params_grid_raw$eta[i],
# gamma = params_grid_raw$gamma[i],
# lambda = params_grid_raw$lambda[i],
# alpha = params_grid_raw$alpha[i]
# )
# cv_raw <- xgb.cv(
# params = params,
# data = dtrain_raw,
# nrounds = 1500,
# nfold = 5,
# early_stopping_rounds = 30,
# verbose = 0
# )
# results_raw[[i]] <- list(
# params = params,
# best_rmse = min(cv_raw$evaluation_log$test_rmse_mean),
# best_iter = cv_raw$best_iteration
# )
#}
Next extract the best hyperparameters (KB)
Need to undo the #’s if want to run the long loop and see the best parameters and nrounds.
# Find the index of the combination with the lowest CV RMSE
#best_index_raw <- which.min(sapply(results_raw, function(x) x$best_rmse))
# Get the best parameters
#best_params_raw <- results_raw[[best_index_raw]]$params
#best_params_raw
# Get the best number of boosting rounds
#best_nrounds_raw <- results_raw[[best_index_raw]]$best_iter
#best_nrounds_raw
Running the XGBoost raw-feature model with the best hyperparameters hard-coded explicitly, so the time-consuming tuning loop no longer needs to be run (KB)
# Fit the final raw-feature XGBoost model with the tuned hyperparameters and
# report train / CV / test error metrics as an overfitting check.
# NOTE(review): relies on dtrain_raw, dtest_raw, train_data_raw, and
# test_data_raw created in earlier chunks.
set.seed(42)

# Best hyperparameters from the (commented-out) CV tuning loop above,
# hard-coded so the document knits without re-running the expensive grid
# search.
best_params_raw <- list(
  objective = "reg:squarederror",
  max_depth = 4,
  min_child_weight = 10,
  subsample = 0.7,
  colsample_bytree = 0.8,
  eta = 0.1,
  gamma = 0,
  lambda = 1,
  alpha = 0
)
best_nrounds_raw <- 1498  # optimal boosting rounds found by xgb.cv

# Train the final XGBoost model on the raw-feature design matrix.
xgb_raw_model <- xgboost(
  data = dtrain_raw,
  params = best_params_raw,
  nrounds = best_nrounds_raw,
  verbose = FALSE
)

# Predictions on the train and test DMatrix objects.
pred_train_raw <- predict(xgb_raw_model, dtrain_raw)
pred_test_raw <- predict(xgb_raw_model, dtest_raw)

# Train metrics (RMSE/MAE from the Metrics package; R^2 computed manually
# as 1 - RSS/TSS against train_data_raw).
rmse_train <- rmse(train_data_raw$avg_chip_seconds, pred_train_raw)
mae_train <- mae(train_data_raw$avg_chip_seconds, pred_train_raw)
rss_train <- sum((train_data_raw$avg_chip_seconds - pred_train_raw)^2)
tss_train <- sum((train_data_raw$avg_chip_seconds - mean(train_data_raw$avg_chip_seconds))^2)
r2_train <- 1 - rss_train / tss_train

# Cross-validated RMSE carried over from the earlier tuning run; it is not
# recomputed here, so update it by hand if the tuning loop is re-run.
cv_rmse <- 164.6655

# Test metrics, computed the same way on the held-out split (test_data_raw).
rmse_test <- rmse(test_data_raw$avg_chip_seconds, pred_test_raw)
mae_test <- mae(test_data_raw$avg_chip_seconds, pred_test_raw)
rss_test <- sum((test_data_raw$avg_chip_seconds - pred_test_raw)^2)
tss_test <- sum((test_data_raw$avg_chip_seconds - mean(test_data_raw$avg_chip_seconds))^2)
r2_test <- 1 - rss_test / tss_test

# Combine all metrics into a single long-format table for display.
model_eval <- data.frame(
  Metric = c("Train RMSE", "Train MAE", "Train R2",
             "CV RMSE",
             "Test RMSE", "Test MAE", "Test R2"),
  Value = c(rmse_train, mae_train, r2_train,
            cv_rmse,
            rmse_test, mae_test, r2_test)
)

# Display the table in the knitted document.
knitr::kable(
  model_eval,
  digits = 4,
  caption = "XGBoost Raw-Feature Model Best Hyperparameters - Overfitting Check (Train MAE & R2 included)"
)
| Metric | Value |
|---|---|
| Train RMSE | 56.5368 |
| Train MAE | 42.5066 |
| Train R2 | 0.9997 |
| CV RMSE | 164.6655 |
| Test RMSE | 136.6817 |
| Test MAE | 101.9962 |
| Test R2 | 0.9983 |
NOTE: Re-running the models may produce slightly different results, with small fluctuations that are likely due to sampling variability.
After tuning, our XGBoost model shows substantial improvement over the baseline, but some caution is still warranted regarding overfitting, so care should be taken when interpreting training performance metrics.
We can see that the model overfits only to a moderate extent (Train RMSE ≈ 56.54 vs. Test RMSE ≈ 136.68). A gap between train and test performance is expected, and the overall error levels are well below those of the baseline model (Train RMSE ≈ 92.52, Test RMSE ≈ 163.04). This indicates that regularization and hyperparameter tuning have helped keep overfitting under control.
We can also see that the cross-validation error is reasonably close to the test error (CV RMSE ≈ 164.67, Test RMSE ≈ 136.68). This is encouraging because it means that the model is stable across folds, the cross-validation process is reliable, and the model is generalizing consistently. It also suggests there is no obvious data leakage.
In addition, both R² scores show that the model fits the data extremely well (Train R² ≈ 0.9997, Test R² ≈ 0.9983). The almost perfect R² is likely due to the subgroup structure and engineered features, rather than overfitting.
Overall, the tuned model clearly outperforms the baseline model. The baseline had a Test RMSE of about 163.04, whereas the tuned model achieves 136.68, which is a meaningful improvement in accuracy. This represents roughly a 16% reduction in test error, showing that the tuning loop successfully improved generalization and stability.
Check for variable importance (ZD, KB)
# Per-feature importance (Gain / Cover / Frequency) extracted from the final
# tuned raw-feature XGBoost model; `importance` is reused below for plotting.
importance <- xgb.importance(model = xgb_raw_model)
print(importance)
## Feature Gain Cover Frequency
## <char> <num> <num> <num>
## 1: subgroupslow 6.599729e-01 0.054971991 0.053945906
## 2: subgrouprecreational 1.951557e-01 0.028817377 0.031048536
## 3: subgroupaverage 7.326884e-02 0.019354912 0.023860689
## 4: gendermale 2.847187e-02 0.043338985 0.095072249
## 5: subgroupcompetitive 2.327263e-02 0.017954712 0.022749166
## 6: wind_speed 2.027099e-03 0.064959041 0.061504261
## 7: low_temp 1.899354e-03 0.074800275 0.064616525
## 8: marathonNYC 1.869667e-03 0.008177140 0.011411634
## 9: sea_level_pressure 1.788071e-03 0.070884552 0.075287143
## 10: avg_temp 1.735640e-03 0.069116252 0.064542423
## 11: co 1.507200e-03 0.085714509 0.071952575
## 12: high_temp 1.469484e-03 0.064939125 0.067728788
## 13: dew_point 1.442495e-03 0.097706723 0.082030382
## 14: aqi 1.398285e-03 0.072047176 0.068692108
## 15: pm25 1.370697e-03 0.074975967 0.058836606
## 16: ozone 1.192775e-03 0.057082783 0.058540200
## 17: no2 1.121181e-03 0.068973992 0.060837347
## 18: marathonChicago 5.434311e-04 0.005526114 0.008373472
## 19: main_pollutantPM2.5 2.865347e-04 0.007181671 0.007780660
## 20: precipitation 1.569707e-04 0.007885861 0.006002223
## 21: main_pollutantOzone 4.920771e-05 0.005590842 0.005187106
## Feature Gain Cover Frequency
xgb.plot.importance(importance)
TODO: add a brief interpretation of the variable-importance plot here.
# Side-by-side comparison of the raw-feature and engineered-feature XGBoost
# models. Values are transcribed from the evaluation tables above, so they
# must be updated by hand if either model is re-run.
metric_labels <- c(
  "Train RMSE", "Train MAE", "Train R²",
  "CV RMSE",
  "Test RMSE", "Test MAE", "Test R²"
)
raw_model_values <- c(
  56.5368, 42.5066, 0.9997,
  164.6655,
  136.6817, 101.9962, 0.9983
)
engineered_model_values <- c(
  49.2031, 36.1951, 0.9998,
  151.6136,
  149.5008, 100.9366, 0.9982
)

comparison_table <- data.frame(
  Metric = metric_labels,
  Raw_Feature_Model = raw_model_values,
  Engineered_Feature_Model = engineered_model_values
)

# Render the comparison as a formatted table in the knitted document.
knitr::kable(
  comparison_table,
  digits = 4,
  caption = "Comparison of XGBoost Raw-Feature vs Engineered-Feature Models"
)
| Metric | Raw_Feature_Model | Engineered_Feature_Model |
|---|---|---|
| Train RMSE | 56.5368 | 49.2031 |
| Train MAE | 42.5066 | 36.1951 |
| Train R² | 0.9997 | 0.9998 |
| CV RMSE | 164.6655 | 151.6136 |
| Test RMSE | 136.6817 | 149.5008 |
| Test MAE | 101.9962 | 100.9366 |
| Test R² | 0.9983 | 0.9982 |
After evaluating both the raw-feature and engineered-feature XGBoost models, we can see some clear differences in performance/stability.
The engineered-feature model has a lower training error (Train RMSE ≈ 49.20, Train MAE ≈ 36.20) and slightly better cross-validation performance (CV RMSE ≈ 151.61) compared to the raw-feature model (Train RMSE ≈ 56.54, Train MAE ≈ 42.51, CV RMSE ≈ 164.67). This means that the engineered features help the model fit the training data more accurately and generalize more consistently across folds.
Interestingly, the raw-feature model shows slightly better test performance (Test RMSE ≈ 136.68, Test MAE ≈ 102.00) than the engineered-feature model (Test RMSE ≈ 149.50, Test MAE ≈ 100.94). However, its higher CV RMSE suggests that this apparent advantage may simply reflect variability in this particular train–test split, making the raw-feature model potentially less stable across different subsets of the data.
Both models have extremely high R2 on the training and test sets (≈0.998–0.999). This means that they are able to capture nearly all variability in the target. Overall, the engineered-feature model is recommended if stability and consistent generalization are priorities, while the raw-feature model may give slightly lower observed test error in this particular split but could be less reliable on new data.
Appendix: code ideas that were considered but not used in the final analysis.
indicate and define year of covid (ZD) final_data <- final_data %>% mutate( covid_era = ifelse(year == 2020, 1, 0) )
replacing the numeric variables with missing data with mean (could do median if we keep this); most likely dropping though (ZD) numeric_vars <- final_data %>% select(where(is.numeric)) %>% names()
final_data <- final_data %>% mutate(across(all_of(numeric_vars), ~ifelse(is.na(.), mean(., na.rm = TRUE), .)))
create custom cutoff times for performance groups (ZD) final_data <- final_data %>% mutate( finish_hours = avg_chip_seconds / 3600, # note: times are assumed to be in seconds — confirm before using subgroup2 = case_when( finish_hours < 3 ~ "elite", finish_hours < 3.75 ~ "competitive", finish_hours < 4.75 ~ "average", finish_hours < 5.75 ~ "recreational", TRUE ~ "slow" ) )